SERVER-45178 Always update rollbackId before truncating oplog for rollback-via-refetch

create mode 100644 jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js (cherry picked from commit 04a2c9acc7ca061fb86736b377b897b11f6c7c48)
author: Vesselina Ratcheva <vesselina.ratcheva@10gen.com> 2020-02-13 19:39:27 -0500
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-02-26 07:57:39 +0000
commit: f74e24b16ec0cae0d1521c34379a0e0854cd4607 (patch)
tree: 824fc805c2594a0ec3995399c4f2b249a4e4beb3
parent: b4181a05a619ef07132891e4abbedf6903ff0d10 (diff)
download: mongo-f74e24b16ec0cae0d1521c34379a0e0854cd4607.tar.gz
4 files changed, 88 insertions, 32 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index b41e335bf64..208866427ef 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -47,6 +47,8 @@ replica_sets_multiversion:
   test_file: jstests/replsets/rollback_dup_ids_clean_shutdown_during_rollback.js
 - ticket: SERVER-45906
   test_file: jstests/replsets/trigger_initial_stable_checkpoint.js
+- ticket: SERVER-45178
+  test_file: jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
 
 sharding_multiversion:
 - ticket: SERVER-38691
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index f138448abd9..feb20d1baa7 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -245,28 +245,21 @@ function RollbackTest(name = "RollbackTest", replSet) {
      * be replicated to all nodes and should not be rolled back.
      */
     this.transitionToSteadyStateOperations = function({skipDataConsistencyChecks = false} = {}) {
-        if (this.isMajorityReadConcernEnabledOnRollbackNode) {
-            log(`Waiting for rollback to complete on ${curSecondary.host}`, true);
-            let rbid = -1;
-            assert.soon(() => {
-                try {
-                    rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
-                } catch (e) {
-                    // Command can fail when sync source is being cleared.
-                }
-                // Fail early if the rbid is greater than lastRBID+1.
-                assert.lte(rbid,
-                           lastRBID + 1,
-                           `RBID is too large. current RBID: ${rbid}, last RBID: ${lastRBID}`);
-
-                return rbid === lastRBID + 1;
-            }, "Timed out waiting for RBID to increment on " + curSecondary.host);
-        } else {
-            // TODO: After fixing SERVER-45178, we can remove the else block as we are guaranteed
-            // that the rollback id will get updated if the rollback has happened on that node.
-            log(`Skipping RBID check on ${curSecondary.host} because shutdowns ` +
-                `may prevent a rollback here.`);
-        }
+        log(`Waiting for rollback to complete on ${curSecondary.host}`, true);
+        let rbid = -1;
+        assert.soon(() => {
+            try {
+                rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
+            } catch (e) {
+                // Command can fail when sync source is being cleared.
+            }
+            // Fail early if the rbid is greater than lastRBID+1.
+            assert.lte(rbid,
+                       lastRBID + 1,
+                       `RBID is too large. current RBID: ${rbid}, last RBID: ${lastRBID}`);
+
+            return rbid === lastRBID + 1;
+        }, "Timed out waiting for RBID to increment on " + curSecondary.host);
 
         // Ensure that the tiebreaker node is connected to the other nodes. We must do this after
         // we are sure that rollback has completed on the rollback node.
@@ -312,12 +305,6 @@ function RollbackTest(name = "RollbackTest", replSet) {
         rst.awaitSecondaryNodes();
         rst.awaitReplication(null, null, [curSecondary]);
 
-        // The current primary will be the node that rolls back. Check if it supports majority reads
-        // here while we are in a steady state.
-        this.isMajorityReadConcernEnabledOnRollbackNode =
-            assert.commandWorked(curPrimary.adminCommand({serverStatus: 1}))
-                .storageEngine.supportsCommittedReads;
-
         transitionIfAllowed(State.kRollbackOps);
 
         // Disconnect the secondary from the tiebreaker node before we disconnect the secondary from
diff --git a/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js b/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
new file mode 100644
index 00000000000..0effc3cc1f5
--- /dev/null
+++ b/jstests/replsets/rollback_via_refetch_update_rollback_id_before_oplog_truncation.js
@@ -0,0 +1,66 @@
+/**
+ * This test demonstrates that rollback via refetch always increments the rollback id as soon as it
+ * resolves the common point and before proceeding with other operations.
+ *
+ * This is a regression test that makes sure we avoid the scenario where we truncate our oplog (at
+ * which point the rollback is effectively finished), then shut down uncleanly before we get a
+ * chance to update the rollbackId.
+ *
+ * @tags: [requires_journaling]
+ */
+
+(function() {
+"use strict";
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/libs/rollback_test.js");
+load("jstests/replsets/rslib.js");
+
+const name = jsTestName();
+TestData.allowUncleanShutdowns = true;
+
+jsTest.log("Set up a RollbackTest with enableMajorityReadConcern=false");
+const rst = new ReplSetTest({
+    name,
+    nodes: [{}, {}, {rsConfig: {priority: 0}}],
+    useBridge: true,
+    nodeOptions: {enableMajorityReadConcern: "false"},
+    settings: {chainingAllowed: false}
+});
+
+rst.startSet();
+rst.initiateWithHighElectionTimeout();
+
+const rollbackTest = new RollbackTest(name, rst);
+const rollbackNode = rollbackTest.transitionToRollbackOperations();
+
+const baseRBID = assert.commandWorked(rollbackNode.adminCommand("replSetGetRBID")).rbid;
+
+rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+
+jsTestLog("Make rollback-via-refetch exit early after truncating the oplog");
+assert.commandWorked(rollbackNode.adminCommand(
+    {configureFailPoint: "rollbackExitEarlyAfterCollectionDrop", mode: "alwaysOn"}));
+
+rollbackTest.transitionToSyncSourceOperationsDuringRollback();
+
+jsTestLog("Wait until we hit the failpoint");
+assert.soonNoExcept(function() {
+    assert.commandWorked(rollbackNode.adminCommand({
+        waitForFailPoint: "rollbackExitEarlyAfterCollectionDrop",
+        timesEntered: 1,
+        maxTimeMS: kDefaultWaitForFailPointTimeout
+    }));
+    return true;
+}, "failed to reconnect for waitForFailPoint");
+
+// Check that the RBID has still managed to advance.
+// Looking at the RBID directly is our first line of defense.
+assert.eq(baseRBID + 1, assert.commandWorked(rollbackNode.adminCommand("replSetGetRBID")).rbid);
+
+assert.commandWorked(rollbackNode.adminCommand(
+    {configureFailPoint: "rollbackExitEarlyAfterCollectionDrop", mode: "off"}));
+
+// Verify that the node can rejoin the set as normal.
+rollbackTest.transitionToSteadyStateOperations();
+rollbackTest.stop();
+}());
+\ No newline at end of file
diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp
index 16272236fb1..2e8b3768297 100644
--- a/src/mongo/db/repl/rs_rollback.cpp
+++ b/src/mongo/db/repl/rs_rollback.cpp
@@ -989,10 +989,11 @@ Status _syncRollback(OperationContext* opCtx,
     invariant(commonPoint >= committedSnapshot);
 
     try {
-        ON_BLOCK_EXIT([&] {
-            auto status = replicationProcess->incrementRollbackID(opCtx);
-            fassert(40497, status);
-        });
+        // It is always safe to increment the rollback ID first, even if we fail to complete
+        // the rollback.
+        auto status = replicationProcess->incrementRollbackID(opCtx);
+        fassert(40497, status);
+
         syncFixUp(opCtx, how, rollbackSource, replCoord, replicationProcess);
 
         if (MONGO_FAIL_POINT(rollbackExitEarlyAfterCollectionDrop)) {
author	Vesselina Ratcheva <vesselina.ratcheva@10gen.com>	2020-02-13 19:39:27 -0500
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-02-26 07:57:39 +0000
commit	f74e24b16ec0cae0d1521c34379a0e0854cd4607 (patch)
tree	824fc805c2594a0ec3995399c4f2b249a4e4beb3
parent	b4181a05a619ef07132891e4abbedf6903ff0d10 (diff)
download	mongo-f74e24b16ec0cae0d1521c34379a0e0854cd4607.tar.gz