diff options
author | Vesselina Ratcheva <vesselina.ratcheva@10gen.com> | 2020-02-13 19:39:27 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-02-26 07:57:39 +0000 |
commit | f74e24b16ec0cae0d1521c34379a0e0854cd4607 (patch) | |
tree | 824fc805c2594a0ec3995399c4f2b249a4e4beb3 | |
parent | b4181a05a619ef07132891e4abbedf6903ff0d10 (diff) | |
download | mongo-f74e24b16ec0cae0d1521c34379a0e0854cd4607.tar.gz |
SERVER-45178 Always update rollbackId before truncating oplog for rollback-via-refetch
create mode 100644 jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
(cherry picked from commit 04a2c9acc7ca061fb86736b377b897b11f6c7c48)
4 files changed, 88 insertions, 32 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index b41e335bf64..208866427ef 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -47,6 +47,8 @@ replica_sets_multiversion: test_file: jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js - ticket: SERVER-45906 test_file: jstests/replsets/trigger_initial_stable_checkpoint.js +- ticket: SERVER-45178 + test_file: jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js sharding_multiversion: - ticket: SERVER-38691 diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js index f138448abd9..feb20d1baa7 100644 --- a/jstests/replsets/libs/rollback_test.js +++ b/jstests/replsets/libs/rollback_test.js @@ -245,28 +245,21 @@ function RollbackTest(name = "RollbackTest", replSet) { * be replicated to all nodes and should not be rolled back. */ this.transitionToSteadyStateOperations = function({skipDataConsistencyChecks = false} = {}) { - if (this.isMajorityReadConcernEnabledOnRollbackNode) { - log(`Waiting for rollback to complete on ${curSecondary.host}`, true); - let rbid = -1; - assert.soon(() => { - try { - rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid; - } catch (e) { - // Command can fail when sync source is being cleared. - } - // Fail early if the rbid is greater than lastRBID+1. - assert.lte(rbid, - lastRBID + 1, - `RBID is too large. current RBID: ${rbid}, last RBID: ${lastRBID}`); - - return rbid === lastRBID + 1; - }, "Timed out waiting for RBID to increment on " + curSecondary.host); - } else { - // TODO: After fixing SERVER-45178, we can remove the else block as we are guaranteed - // that the rollback id will get updated if the rollback has happened on that node. - log(`Skipping RBID check on ${curSecondary.host} because shutdowns ` + - `may prevent a rollback here.`); - } + log(`Waiting for rollback to complete on ${curSecondary.host}`, true); + let rbid = -1; + assert.soon(() => { + try { + rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid; + } catch (e) { + // Command can fail when sync source is being cleared. + } + // Fail early if the rbid is greater than lastRBID+1. + assert.lte(rbid, + lastRBID + 1, + `RBID is too large. current RBID: ${rbid}, last RBID: ${lastRBID}`); + + return rbid === lastRBID + 1; + }, "Timed out waiting for RBID to increment on " + curSecondary.host); // Ensure that the tiebreaker node is connected to the other nodes. We must do this after // we are sure that rollback has completed on the rollback node. @@ -312,12 +305,6 @@ function RollbackTest(name = "RollbackTest", replSet) { rst.awaitSecondaryNodes(); rst.awaitReplication(null, null, [curSecondary]); - // The current primary will be the node that rolls back. Check if it supports majority reads - // here while we are in a steady state. - this.isMajorityReadConcernEnabledOnRollbackNode = - assert.commandWorked(curPrimary.adminCommand({serverStatus: 1})) - .storageEngine.supportsCommittedReads; - transitionIfAllowed(State.kRollbackOps); // Disconnect the secondary from the tiebreaker node before we disconnect the secondary from diff --git a/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js b/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js new file mode 100644 index 00000000000..0effc3cc1f5 --- /dev/null +++ b/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js @@ -0,0 +1,66 @@ +/** + * This test demonstrates that rollback via refetch always increments the rollback id as soon as it + * resolves the common point and before proceeding with other operations. + * + * This is a regression test that makes sure we avoid the scenario where we truncate our oplog (at + * which point the rollback is effectively finished), then shut down uncleanly before we get a + * chance to update the rollbackId. + * + * @tags: [requires_journaling] + */ + +(function() { +"use strict"; +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/libs/rollback_test.js"); +load("jstests/replsets/rslib.js"); + +const name = jsTestName(); +TestData.allowUncleanShutdowns = true; + +jsTest.log("Set up a RollbackTest with enableMajorityReadConcern=false"); +const rst = new ReplSetTest({ + name, + nodes: [{}, {}, {rsConfig: {priority: 0}}], + useBridge: true, + nodeOptions: {enableMajorityReadConcern: "false"}, + settings: {chainingAllowed: false} +}); + +rst.startSet(); +rst.initiateWithHighElectionTimeout(); + +const rollbackTest = new RollbackTest(name, rst); +const rollbackNode = rollbackTest.transitionToRollbackOperations(); + +const baseRBID = assert.commandWorked(rollbackNode.adminCommand("replSetGetRBID")).rbid; + +rollbackTest.transitionToSyncSourceOperationsBeforeRollback(); + +jsTestLog("Make rollback-via-refetch exit early after truncating the oplog"); +assert.commandWorked(rollbackNode.adminCommand( + {configureFailPoint: "rollbackExitEarlyAfterCollectionDrop", mode: "alwaysOn"})); + +rollbackTest.transitionToSyncSourceOperationsDuringRollback(); + +jsTestLog("Wait until we hit the failpoint"); +assert.soonNoExcept(function() { + assert.commandWorked(rollbackNode.adminCommand({ + waitForFailPoint: "rollbackExitEarlyAfterCollectionDrop", + timesEntered: 1, + maxTimeMS: kDefaultWaitForFailPointTimeout + })); + return true; +}, "failed to reconnect for waitForFailPoint"); + +// Check that the RBID has still managed to advance. +// Looking at the RBID directly is our first line of defense. +assert.eq(baseRBID + 1, assert.commandWorked(rollbackNode.adminCommand("replSetGetRBID")).rbid); + +assert.commandWorked(rollbackNode.adminCommand( + {configureFailPoint: "rollbackExitEarlyAfterCollectionDrop", mode: "off"})); + +// Verify that the node can rejoin the set as normal. +rollbackTest.transitionToSteadyStateOperations(); +rollbackTest.stop(); +}());
\ No newline at end of file diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp index 16272236fb1..2e8b3768297 100644 --- a/src/mongo/db/repl/rs_rollback.cpp +++ b/src/mongo/db/repl/rs_rollback.cpp @@ -989,10 +989,11 @@ Status _syncRollback(OperationContext* opCtx, invariant(commonPoint >= committedSnapshot); try { - ON_BLOCK_EXIT([&] { - auto status = replicationProcess->incrementRollbackID(opCtx); - fassert(40497, status); - }); + // It is always safe to increment the rollback ID first, even if we fail to complete + // the rollback. + auto status = replicationProcess->incrementRollbackID(opCtx); + fassert(40497, status); + syncFixUp(opCtx, how, rollbackSource, replCoord, replicationProcess); if (MONGO_FAIL_POINT(rollbackExitEarlyAfterCollectionDrop)) { |