summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVesselina Ratcheva <vesselina.ratcheva@10gen.com>2020-02-13 19:39:27 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-02-26 07:57:39 +0000
commitf74e24b16ec0cae0d1521c34379a0e0854cd4607 (patch)
tree824fc805c2594a0ec3995399c4f2b249a4e4beb3
parentb4181a05a619ef07132891e4abbedf6903ff0d10 (diff)
downloadmongo-f74e24b16ec0cae0d1521c34379a0e0854cd4607.tar.gz
SERVER-45178 Always update rollbackId before truncating oplog for rollback-via-refetch
create mode 100644 jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js (cherry picked from commit 04a2c9acc7ca061fb86736b377b897b11f6c7c48)
-rw-r--r--etc/backports_required_for_multiversion_tests.yml2
-rw-r--r--jstests/replsets/libs/rollback_test.js43
-rw-r--r--jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js66
-rw-r--r--src/mongo/db/repl/rs_rollback.cpp9
4 files changed, 88 insertions, 32 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index b41e335bf64..208866427ef 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -47,6 +47,8 @@ replica_sets_multiversion:
test_file: jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js
- ticket: SERVER-45906
test_file: jstests/replsets/trigger_initial_stable_checkpoint.js
+- ticket: SERVER-45178
+ test_file: jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
sharding_multiversion:
- ticket: SERVER-38691
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index f138448abd9..feb20d1baa7 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -245,28 +245,21 @@ function RollbackTest(name = "RollbackTest", replSet) {
* be replicated to all nodes and should not be rolled back.
*/
this.transitionToSteadyStateOperations = function({skipDataConsistencyChecks = false} = {}) {
- if (this.isMajorityReadConcernEnabledOnRollbackNode) {
- log(`Waiting for rollback to complete on ${curSecondary.host}`, true);
- let rbid = -1;
- assert.soon(() => {
- try {
- rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
- } catch (e) {
- // Command can fail when sync source is being cleared.
- }
- // Fail early if the rbid is greater than lastRBID+1.
- assert.lte(rbid,
- lastRBID + 1,
- `RBID is too large. current RBID: ${rbid}, last RBID: ${lastRBID}`);
-
- return rbid === lastRBID + 1;
- }, "Timed out waiting for RBID to increment on " + curSecondary.host);
- } else {
- // TODO: After fixing SERVER-45178, we can remove the else block as we are guaranteed
- // that the rollback id will get updated if the rollback has happened on that node.
- log(`Skipping RBID check on ${curSecondary.host} because shutdowns ` +
- `may prevent a rollback here.`);
- }
+ log(`Waiting for rollback to complete on ${curSecondary.host}`, true);
+ let rbid = -1;
+ assert.soon(() => {
+ try {
+ rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
+ } catch (e) {
+ // Command can fail when sync source is being cleared.
+ }
+ // Fail early if the rbid is greater than lastRBID+1.
+ assert.lte(rbid,
+ lastRBID + 1,
+ `RBID is too large. current RBID: ${rbid}, last RBID: ${lastRBID}`);
+
+ return rbid === lastRBID + 1;
+ }, "Timed out waiting for RBID to increment on " + curSecondary.host);
// Ensure that the tiebreaker node is connected to the other nodes. We must do this after
// we are sure that rollback has completed on the rollback node.
@@ -312,12 +305,6 @@ function RollbackTest(name = "RollbackTest", replSet) {
rst.awaitSecondaryNodes();
rst.awaitReplication(null, null, [curSecondary]);
- // The current primary will be the node that rolls back. Check if it supports majority reads
- // here while we are in a steady state.
- this.isMajorityReadConcernEnabledOnRollbackNode =
- assert.commandWorked(curPrimary.adminCommand({serverStatus: 1}))
- .storageEngine.supportsCommittedReads;
-
transitionIfAllowed(State.kRollbackOps);
// Disconnect the secondary from the tiebreaker node before we disconnect the secondary from
diff --git a/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js b/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
new file mode 100644
index 00000000000..0effc3cc1f5
--- /dev/null
+++ b/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
@@ -0,0 +1,66 @@
+/**
+ * This test demonstrates that rollback via refetch always increments the rollback id as soon as it
+ * resolves the common point and before proceeding with other operations.
+ *
+ * This is a regression test that makes sure we avoid the scenario where we truncate our oplog (at
+ * which point the rollback is effectively finished), then shut down uncleanly before we get a
+ * chance to update the rollbackId.
+ *
+ * @tags: [requires_journaling]
+ */
+
+(function() {
+"use strict";
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/libs/rollback_test.js");
+load("jstests/replsets/rslib.js");
+
+const name = jsTestName();
+TestData.allowUncleanShutdowns = true;
+
+jsTest.log("Set up a RollbackTest with enableMajorityReadConcern=false");
+const rst = new ReplSetTest({
+ name,
+ nodes: [{}, {}, {rsConfig: {priority: 0}}],
+ useBridge: true,
+ nodeOptions: {enableMajorityReadConcern: "false"},
+ settings: {chainingAllowed: false}
+});
+
+rst.startSet();
+rst.initiateWithHighElectionTimeout();
+
+const rollbackTest = new RollbackTest(name, rst);
+const rollbackNode = rollbackTest.transitionToRollbackOperations();
+
+const baseRBID = assert.commandWorked(rollbackNode.adminCommand("replSetGetRBID")).rbid;
+
+rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+
+jsTestLog("Make rollback-via-refetch exit early after truncating the oplog");
+assert.commandWorked(rollbackNode.adminCommand(
+ {configureFailPoint: "rollbackExitEarlyAfterCollectionDrop", mode: "alwaysOn"}));
+
+rollbackTest.transitionToSyncSourceOperationsDuringRollback();
+
+jsTestLog("Wait until we hit the failpoint");
+assert.soonNoExcept(function() {
+ assert.commandWorked(rollbackNode.adminCommand({
+ waitForFailPoint: "rollbackExitEarlyAfterCollectionDrop",
+ timesEntered: 1,
+ maxTimeMS: kDefaultWaitForFailPointTimeout
+ }));
+ return true;
+}, "failed to reconnect for waitForFailPoint");
+
+// Check that the RBID has still managed to advance.
+// Looking at the RBID directly is our first line of defense.
+assert.eq(baseRBID + 1, assert.commandWorked(rollbackNode.adminCommand("replSetGetRBID")).rbid);
+
+assert.commandWorked(rollbackNode.adminCommand(
+ {configureFailPoint: "rollbackExitEarlyAfterCollectionDrop", mode: "off"}));
+
+// Verify that the node can rejoin the set as normal.
+rollbackTest.transitionToSteadyStateOperations();
+rollbackTest.stop();
+}()); \ No newline at end of file
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
index 16272236fb1..2e8b3768297 100644
--- a/src/mongo/db/repl/rs_rollback.cpp
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -989,10 +989,11 @@ Status _syncRollback(OperationContext* opCtx,
invariant(commonPoint >= committedSnapshot);
try {
- ON_BLOCK_EXIT([&] {
- auto status = replicationProcess->incrementRollbackID(opCtx);
- fassert(40497, status);
- });
+ // It is always safe to increment the rollback ID first, even if we fail to complete
+ // the rollback.
+ auto status = replicationProcess->incrementRollbackID(opCtx);
+ fassert(40497, status);
+
syncFixUp(opCtx, how, rollbackSource, replCoord, replicationProcess);
if (MONGO_FAIL_POINT(rollbackExitEarlyAfterCollectionDrop)) {