SERVER-33251 Retry stepdown in multi_rs.js

author: Spencer T Brody <spencer@mongodb.com> 2018-02-09 16:58:42 -0500
committer: Spencer T Brody <spencer@mongodb.com> 2018-02-15 17:19:07 -0500
commit: 8dc995ebef6d9ae671a82b1cdc521e0960d32ec8 (patch)
tree: 95764008f4d884205d4cdf43a39cd9e0ccc4b622 /jstests/multiVersion/libs/multi_rs.js
parent: aa8388487e3bea0737237b48a06e6b20b243e791 (diff)
download: mongo-8dc995ebef6d9ae671a82b1cdc521e0960d32ec8.tar.gz
1 file changed, 18 insertions, 8 deletions
diff --git a/jstests/multiVersion/libs/multi_rs.js b/jstests/multiVersion/libs/multi_rs.js
index 40fe9556b99..60bc253d877 100644
--- a/jstests/multiVersion/libs/multi_rs.js
+++ b/jstests/multiVersion/libs/multi_rs.js
@@ -77,8 +77,9 @@ ReplSetTest.prototype.upgradeNode = function(node, opts = {}, user, pwd) {
         // Must retry this command, as it might return "currently running for election" and fail.
         // Node might still be running for an election that will fail because it lost the election
         // race with another node, at test initialization.  See SERVER-23133.
-        assert.soon(function() {
-            return (node.adminCommand("replSetMaintenance").ok);
+        assert.soonNoExcept(function() {
+            assert.commandWorked(node.adminCommand("replSetMaintenance"));
+            return true;
         });
         this.waitForState(node, ReplSetTest.State.RECOVERING);
     }
@@ -100,12 +101,21 @@ ReplSetTest.prototype.stepdown = function(nodeId) {
     assert.eq(this.getNodeId(this.getPrimary()), nodeId);
     var node = this.nodes[nodeId];
 
-    try {
-        node.getDB("admin").runCommand({replSetStepDown: 300, secondaryCatchUpPeriodSecs: 60});
-        assert(false);
-    } catch (ex) {
-        print('Caught exception after stepDown cmd: ' + tojson(ex));
-    }
+    assert.soonNoExcept(function() {
+        // Due to a rare race condition in stepdown, it's possible the secondary just replicated
+        // the most recent write and sent replSetUpdatePosition to the primary, and that
+        // replSetUpdatePosition command gets interrupted by the stepdown.  In that case,
+        // the secondary will clear its sync source, but will be unable to re-connect to the
+        // primary that is trying to step down, because they are at the same OpTime.  The primary
+        // will then get stuck waiting forever for the secondary to catch up so it can complete
+        // stepdown.  Adding a garbage write here ensures that the secondary will be able to
+        // resume syncing from the primary in this case, which in turn will let the primary
+        // finish stepping down successfully.
+        node.getDB('admin').garbageWriteToAdvanceOpTime.insert({a: 1});
+        assert.adminCommandWorkedAllowingNetworkError(
+            node, {replSetStepDown: 5 * 60, secondaryCatchUpPeriodSecs: 60});
+        return true;
+    });
 
     return this.reconnect(node);
 };
author	Spencer T Brody <spencer@mongodb.com>	2018-02-09 16:58:42 -0500
committer	Spencer T Brody <spencer@mongodb.com>	2018-02-15 17:19:07 -0500
commit	8dc995ebef6d9ae671a82b1cdc521e0960d32ec8 (patch)
tree	95764008f4d884205d4cdf43a39cd9e0ccc4b622 /jstests/multiVersion/libs/multi_rs.js
parent	aa8388487e3bea0737237b48a06e6b20b243e791 (diff)
download	mongo-8dc995ebef6d9ae671a82b1cdc521e0960d32ec8.tar.gz