diff options
author | A. Jesse Jiryu Davis <jesse@mongodb.com> | 2020-06-26 17:04:01 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-07-11 23:55:48 +0000 |
commit | 74ab0cdac56e1cc62fc96d3ca3be3ddfa54f2bcb (patch) | |
tree | 77d0347dc607247ac8ec1c64ee7e593de3fbbeea /jstests | |
parent | 4b918b3f7693c42ad74dfa5030e7cf2ffc3d1ddb (diff) | |
download | mongo-74ab0cdac56e1cc62fc96d3ca3be3ddfa54f2bcb.tar.gz |
SERVER-48518 Fix rollback via refetch anomaly
Diffstat (limited to 'jstests')
4 files changed, 241 insertions, 3 deletions
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js index 7ab5068aa97..b4420dc67b0 100644 --- a/jstests/replsets/libs/rollback_test.js +++ b/jstests/replsets/libs/rollback_test.js @@ -429,12 +429,46 @@ function RollbackTest(name = "RollbackTest", replSet) { }; /** + * Insert on primary until its lastApplied is >= the rollback node's. + */ + const _awaitPrimaryAppliedSurpassesRollbackApplied = function() { + log(`Waiting for lastApplied on sync source ${curPrimary.host} to surpass lastApplied` + + ` on rollback node ${curSecondary.host}`); + + function lastApplied(node) { + const reply = assert.commandWorked(node.adminCommand({replSetGetStatus: 1})); + return reply.optimes.appliedOpTime.ts; + } + + const rollbackApplied = lastApplied(curSecondary); + assert.soon(() => { + const primaryApplied = lastApplied(curPrimary); + jsTestLog( + `lastApplied on sync source ${curPrimary.host}:` + + ` ${tojson(primaryApplied)}, lastApplied on rollback node ${curSecondary.host}:` + + ` ${tojson(rollbackApplied)}`); + + if (timestampCmp(primaryApplied, rollbackApplied) >= 0) { + return true; + } + + let crudColl = curPrimary.getDB("test")["awaitPrimaryAppliedSurpassesRollbackApplied"]; + assert.commandWorked(crudColl.insertOne({})); + }, "primary's lastApplied never surpassed rollback node's"); + }; + + /** * Transition to the second stage of rollback testing, where we isolate the old primary and * elect the old secondary as the new primary. Then, operations can be performed on the new * primary so that that optimes diverge and previous operations on the old primary will be * rolled back. + * + * If awaitPrimaryAppliedSurpassesRollbackApplied, insert on new primary until its lastApplied + * is >= the rollback node's. Useful for testing rollback via refetch, which completes rollback + * recovery when new lastApplied >= old top of oplog. */ - this.transitionToSyncSourceOperationsBeforeRollback = function() { + this.transitionToSyncSourceOperationsBeforeRollback = function( + {awaitPrimaryAppliedSurpassesRollbackApplied = false} = {}) { const start = new Date(); transitionIfAllowed(State.kSyncSourceOpsBeforeRollback); @@ -498,6 +532,10 @@ function RollbackTest(name = "RollbackTest", replSet) { lastRBID = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid; + if (awaitPrimaryAppliedSurpassesRollbackApplied) { + _awaitPrimaryAppliedSurpassesRollbackApplied(); + } + log(`RollbackTest transition to ${curState} took ${(new Date() - start)} ms`); // The current primary, which is the old secondary, will later become the sync source. return curPrimary; @@ -550,6 +588,10 @@ function RollbackTest(name = "RollbackTest", replSet) { return curSecondary; }; + this.getTieBreaker = function() { + return tiebreakerNode; + }; + this.restartNode = function(nodeId, signal, startOptions, allowedExitCode) { assert(signal === SIGKILL || signal === SIGTERM, `Received unknown signal: ${signal}`); assert.gte(nodeId, 0, "Invalid argument to RollbackTest.restartNode()"); diff --git a/jstests/replsets/rollback_via_refetch_anomaly.js b/jstests/replsets/rollback_via_refetch_anomaly.js new file mode 100644 index 00000000000..1a3f192e498 --- /dev/null +++ b/jstests/replsets/rollback_via_refetch_anomaly.js @@ -0,0 +1,194 @@ +/** + * Rolled-back documents must not be visible when querying a recovered secondary. + * + * This test attempts to reproduce SERVER-48518. In the following description, 'T' is the tiebreaker + * node, 'P1' and 'P2' are primaries in terms 1 and 2. + * + * - Begin in RollbackTest.kSteadyStateOps with primary P1, all nodes connected: + * + * T + * / \ + * P1 - P2 + * primary secondary + * + * - Insert _id 0 into P1 at timestamp 1. + * - Transition to kRollbackOps by disconnecting P1 from P2: + * + * T + * / + * P1 P2 + * primary secondary + * + * - Insert _id 1 into P1 at timestamp 2. + * + * TS 1 TS 2 + * insert 0 insert 1 + * P1 ---------------> + * P2 ---> + * + * - Isolate P1 from T, connect P2 to T: + * + * T + * \ + * P1 P2 + * primary new primary + * + * (Same as RollbackTest.transitionToSyncSourceOperationsBeforeRollback(), except do *not* trigger a + * stepdown on P1.) + * + * - Step up P2, which writes a no-op to its oplog at timestamp 3. + * + * TS 1 TS 2 + * insert 0 insert 1 + * + * P1 ---------------> + * P2 ----* + * \ + * *--------------------------> + * no-op + * TS 3 + * + * - Delete _id 0 and 1 from P1 at timestamp 4. + * + * TS 1 TS 2 TS 4 + * insert 0 insert 1 delete 0 and 1 + * + * P1 ---------------------------------------------------------------> + * P2 ----* + * \ + * *--------------------------> + * no-op + * TS 3 + * + * - Reconnect P1 to P2 so it rolls back. + * + * T + * \ + * P1 - P2 + * rollback new primary + * + * Rollback via refetch undoes the delete of _id 0 by reinserting _id 0 in P1 with an + * untimestamped write. (It can't undo the delete of _id 1, since P2 doesn't have _id 1.) + * + * Before we fixed SERVER-48518, P1 served queries at lastApplied = top of P2's oplog = TS 3, + * which includes _id 0, _id 1, and _id 0 again (it was reinserted with an untimestamped write). + * To fix SERVER-48518, P1 won't transition from ROLLBACK until its lastApplied >= max(P1's oplog + * top, P2's oplog top) = TS 4. + * + * - Write to P2 so it advances >= timestamp 4 and satisfies P1's conditions to finish rollback. + * - Wait for P1 to finish rollback and transition to SECONDARY. + * - Query P1 and check that rolled-back records aren't visible. + * + * To end the test, RollbackTest.transitionToSteadyStateOperations won't work, we've diverged from + * the state it expects, so we end the test manually. Reconnect P1 to T, enable replication on T, + * and stop the replica set. + * + * T + * / \ + * P1 - P2 + * secondary new primary + */ + +(function() { +"use strict"; + +load("jstests/core/txns/libs/prepare_helpers.js"); +load("jstests/libs/write_concern_util.js"); +load("jstests/replsets/libs/rollback_test.js"); + +const rst = new ReplSetTest({ + nodes: 3, + useBridge: true, + nodeOptions: { + enableMajorityReadConcern: "false", + setParameter: {logComponentVerbosity: tojsononeline({replication: 2})} + } +}); + +rst.startSet(); +const config = rst.getReplSetConfig(); +config.members[2].priority = 0; +config.settings = { + chainingAllowed: false +}; +rst.initiateWithHighElectionTimeout(config); + +const rollbackTest = new RollbackTest(jsTestName(), rst); +const P1 = rollbackTest.getPrimary(); +const P2 = rollbackTest.getSecondary(); +const T = rollbackTest.getTieBreaker(); + +jsTestLog(`Node P1: ${P1.host}, P2: ${P2.host}, T: ${T.host}`); + +let testDB = P1.getDB(jsTestName()); +const testColl = testDB.getCollection("test"); + +let reply = assert.commandWorked(testColl.insert({_id: 0}, {"writeConcern": {"w": "majority"}})); +jsTestLog(`Inserted _id 0 into P1 ${reply.operationTime}`); + +rollbackTest.transitionToRollbackOperations(); +reply = assert.commandWorked(testColl.insert({_id: 1}, {"writeConcern": {"w": 1}})); +jsTestLog(`Inserted _id 1 into P1 ${reply.operationTime}`); + +jsTestLog("Isolating P1 from tiebreaker"); +P1.disconnect([T]); + +jsTestLog("Reconnecting P2 to the tiebreaker"); +P2.reconnect([T]); + +jsTestLog("Step up P2"); +assert.soonNoExcept(() => { + const res = P2.adminCommand({replSetStepUp: 1}); + return res.ok; +}, "Failed to step up P2", ReplSetTest.kDefaultTimeoutMS); +// "Transition to primary complete; database writes are now permitted". +checkLog.containsJson(P2, 21331); +jsTestLog("P2 stepped up"); + +reply = assert.commandWorked(testDB.runCommand({delete: "test", deletes: [{q: {}, limit: 0}]})); +jsTestLog(`Deleted from P1 at ${reply.operationTime}`); + +// Ensure P1's lastApplied > P2's, even if P1's set-up entry was written at the same timestamp as +// P2's delete timestamp. +assert.soon(() => { + testDB.runCommand({insert: "otherCollection", documents: [{}]}); + function lastApplied(node) { + const reply = assert.commandWorked(node.adminCommand({replSetGetStatus: 1})); + return reply.optimes.appliedOpTime.ts; + } + const P1applied = lastApplied(P1); + const P2applied = lastApplied(P2); + jsTestLog(`P1 lastApplied ${P1applied}, P2 lastApplied ${P2applied}`); + return timestampCmp(P1applied, P2applied) > 0; +}, "P1's lastApplied never surpassed P2's"); + +jsTestLog("Reconnecting P1 to P2, so P1 rolls back"); +P1.reconnect([P2]); +// "Rollback using the 'rollbackViaRefetch' method". +checkLog.containsJson(P1, 21103); +// "Finding the Common Point". +checkLog.containsJson(P1, 21682); +// "We cannot transition to SECONDARY state because our 'lastApplied' optime is less than the +// initial data timestamp and enableMajorityReadConcern = false". +checkLog.containsJson(P1, 4851800); + +reply = assert.commandWorked( + P2.getDB(jsTestName()).runCommand({insert: "anotherCollection", documents: [{}]})); +jsTestLog(`Inserted into P2 at ${tojson(reply.operationTime)}`); + +jsTestLog("Wait for P1 to enter SECONDARY"); +waitForState(P1, ReplSetTest.State.SECONDARY); + +// Both counts should be 1. If SERVER-48518 isn't fixed then itCount() = 3: _ids 0, 1, and 0 again! +jsTestLog("Check collection count"); +let itCount = testColl.find().itcount(); +let fastCount = testColl.count(); +assert.eq(itCount, + fastCount, + `count: ${fastCount}, itCount: ${itCount}, data: ${tojson(testColl.find().toArray())}`); + +jsTestLog("Check succeeded. Ending test."); +P1.reconnect([T]); +restartServerReplication(T); +rst.stopSet(); +}()); diff --git a/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js b/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js index e9bd8876672..37bb9da1d31 100644 --- a/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js +++ b/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js @@ -42,7 +42,8 @@ assert.commandWorked(rollbackDB.adminCommand( assert.commandWorked(rollbackDB.runCommand({create: collName})); // Step down the current primary and elect the node that does not have the collection. -rollbackTest.transitionToSyncSourceOperationsBeforeRollback(); +rollbackTest.transitionToSyncSourceOperationsBeforeRollback( + {awaitPrimaryAppliedSurpassesRollbackApplied: true}); jsTestLog("Attempting to roll back."); // Make the old primary rollback against the new primary. This attempt should fail because the diff --git a/jstests/replsets/transactions_after_rollback_via_refetch.js b/jstests/replsets/transactions_after_rollback_via_refetch.js index 80ef4a8ded9..16c1e56b6d6 100644 --- a/jstests/replsets/transactions_after_rollback_via_refetch.js +++ b/jstests/replsets/transactions_after_rollback_via_refetch.js @@ -70,7 +70,8 @@ CommonOps(rollbackTest.getPrimary()); let rollbackNode = rollbackTest.transitionToRollbackOperations(); RollbackOps(rollbackNode); -let syncSourceNode = rollbackTest.transitionToSyncSourceOperationsBeforeRollback(); +let syncSourceNode = rollbackTest.transitionToSyncSourceOperationsBeforeRollback( + {awaitPrimaryAppliedSurpassesRollbackApplied: true}); SyncSourceOps(syncSourceNode); // Wait for rollback to finish. |