summary | refs | log | tree | commit | diff
path: root/jstests/multiVersion/libs/multi_rs.js
diff options
context:
space:
mode:
author: Spencer T Brody <spencer@mongodb.com> 2018-02-09 16:58:42 -0500
committer: Spencer T Brody <spencer@mongodb.com> 2018-02-15 17:19:07 -0500
commit: 8dc995ebef6d9ae671a82b1cdc521e0960d32ec8 (patch)
tree: 95764008f4d884205d4cdf43a39cd9e0ccc4b622 /jstests/multiVersion/libs/multi_rs.js
parent: aa8388487e3bea0737237b48a06e6b20b243e791 (diff)
download: mongo-8dc995ebef6d9ae671a82b1cdc521e0960d32ec8.tar.gz
SERVER-33251 Retry stepdown in multi_rs.js
Diffstat (limited to 'jstests/multiVersion/libs/multi_rs.js')
-rw-r--r-- jstests/multiVersion/libs/multi_rs.js 26
1 files changed, 18 insertions, 8 deletions
diff --git a/jstests/multiVersion/libs/multi_rs.js b/jstests/multiVersion/libs/multi_rs.js
index 40fe9556b99..60bc253d877 100644
--- a/jstests/multiVersion/libs/multi_rs.js
+++ b/jstests/multiVersion/libs/multi_rs.js
@@ -77,8 +77,9 @@ ReplSetTest.prototype.upgradeNode = function(node, opts = {}, user, pwd) {
// Must retry this command, as it might return "currently running for election" and fail.
// Node might still be running for an election that will fail because it lost the election
// race with another node, at test initialization. See SERVER-23133.
- assert.soon(function() {
- return (node.adminCommand("replSetMaintenance").ok);
+ assert.soonNoExcept(function() {
+ assert.commandWorked(node.adminCommand("replSetMaintenance"));
+ return true;
});
this.waitForState(node, ReplSetTest.State.RECOVERING);
}
@@ -100,12 +101,21 @@ ReplSetTest.prototype.stepdown = function(nodeId) {
assert.eq(this.getNodeId(this.getPrimary()), nodeId);
var node = this.nodes[nodeId];
- try {
- node.getDB("admin").runCommand({replSetStepDown: 300, secondaryCatchUpPeriodSecs: 60});
- assert(false);
- } catch (ex) {
- print('Caught exception after stepDown cmd: ' + tojson(ex));
- }
+ assert.soonNoExcept(function() {
+ // Due to a rare race condition in stepdown, it's possible the secondary just replicated
+ // the most recent write and sent replSetUpdatePosition to the primary, and that
+ // replSetUpdatePosition command gets interrupted by the stepdown. In that case,
+ // the secondary will clear its sync source, but will be unable to re-connect to the
+ // primary that is trying to step down, because they are at the same OpTime. The primary
+ // will then get stuck waiting forever for the secondary to catch up so it can complete
+ // stepdown. Adding a garbage write here ensures that the secondary will be able to
+ // resume syncing from the primary in this case, which in turn will let the primary
+ // finish stepping down successfully.
+ node.getDB('admin').garbageWriteToAdvanceOpTime.insert({a: 1});
+ assert.adminCommandWorkedAllowingNetworkError(
+ node, {replSetStepDown: 5 * 60, secondaryCatchUpPeriodSecs: 60});
+ return true;
+ });
return this.reconnect(node);
};