diff options
author | Spencer T Brody <spencer@mongodb.com> | 2018-02-09 16:58:42 -0500 |
---|---|---|
committer | Spencer T Brody <spencer@mongodb.com> | 2018-02-15 17:19:07 -0500 |
commit | 8dc995ebef6d9ae671a82b1cdc521e0960d32ec8 (patch) | |
tree | 95764008f4d884205d4cdf43a39cd9e0ccc4b622 /jstests/multiVersion/libs/multi_rs.js | |
parent | aa8388487e3bea0737237b48a06e6b20b243e791 (diff) | |
download | mongo-8dc995ebef6d9ae671a82b1cdc521e0960d32ec8.tar.gz |
SERVER-33251 Retry stepdown in multi_rs.js
Diffstat (limited to 'jstests/multiVersion/libs/multi_rs.js')
-rw-r--r-- | jstests/multiVersion/libs/multi_rs.js | 26 |
1 file changed, 18 insertions, 8 deletions
diff --git a/jstests/multiVersion/libs/multi_rs.js b/jstests/multiVersion/libs/multi_rs.js index 40fe9556b99..60bc253d877 100644 --- a/jstests/multiVersion/libs/multi_rs.js +++ b/jstests/multiVersion/libs/multi_rs.js @@ -77,8 +77,9 @@ ReplSetTest.prototype.upgradeNode = function(node, opts = {}, user, pwd) { // Must retry this command, as it might return "currently running for election" and fail. // Node might still be running for an election that will fail because it lost the election // race with another node, at test initialization. See SERVER-23133. - assert.soon(function() { - return (node.adminCommand("replSetMaintenance").ok); + assert.soonNoExcept(function() { + assert.commandWorked(node.adminCommand("replSetMaintenance")); + return true; }); this.waitForState(node, ReplSetTest.State.RECOVERING); } @@ -100,12 +101,21 @@ ReplSetTest.prototype.stepdown = function(nodeId) { assert.eq(this.getNodeId(this.getPrimary()), nodeId); var node = this.nodes[nodeId]; - try { - node.getDB("admin").runCommand({replSetStepDown: 300, secondaryCatchUpPeriodSecs: 60}); - assert(false); - } catch (ex) { - print('Caught exception after stepDown cmd: ' + tojson(ex)); - } + assert.soonNoExcept(function() { + // Due to a rare race condition in stepdown, it's possible the secondary just replicated + // the most recent write and sent replSetUpdatePosition to the primary, and that + // replSetUpdatePosition command gets interrupted by the stepdown. In that case, + // the secondary will clear its sync source, but will be unable to re-connect to the + // primary that is trying to step down, because they are at the same OpTime. The primary + // will then get stuck waiting forever for the secondary to catch up so it can complete + // stepdown. Adding a garbage write here ensures that the secondary will be able to + // resume syncing from the primary in this case, which in turn will let the primary + // finish stepping down successfully. 
+ node.getDB('admin').garbageWriteToAdvanceOpTime.insert({a: 1}); + assert.adminCommandWorkedAllowingNetworkError( + node, {replSetStepDown: 5 * 60, secondaryCatchUpPeriodSecs: 60}); + return true; + }); return this.reconnect(node); }; |