diff options
author | Suganthi Mani <suganthi.mani@mongodb.com> | 2020-06-30 02:37:34 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-11-20 06:45:42 +0000 |
commit | db72156b34591a37f98f1eeae0e5d0c67ed555ff (patch) | |
tree | 650efdc3a77533dc75c53d83798746b556a7546f | |
parent | 9870937b91b88348e619580f1050965b1006e33d (diff) | |
download | mongo-db72156b34591a37f98f1eeae0e5d0c67ed555ff.tar.gz |
SERVER-43847 Make ReplSetTest's stepUp function resilient to slow machines.
(cherry picked from commit c5a53e4882bd316dcb37141ccfab56f5acaec8f4)
SERVER-49187 Make ReplSetTest.stepUp() robust to election failures.
(cherry picked from commit 311b7982f61009fd08bd7b76b1638d62cc8703de)
-rw-r--r-- | jstests/replsets/avg_num_catchup_ops.js | 3 | ||||
-rw-r--r-- | jstests/replsets/catchup.js | 4 | ||||
-rw-r--r-- | jstests/replsets/change_stream_stepdown.js | 29 | ||||
-rw-r--r-- | jstests/replsets/reconstruct_prepared_transactions_initial_sync.js | 4 | ||||
-rw-r--r-- | jstests/replsets/rslib.js | 3 | ||||
-rw-r--r-- | jstests/replsets/step_down_during_draining.js | 6 | ||||
-rw-r--r-- | jstests/replsets/step_down_during_draining2.js | 4 | ||||
-rw-r--r-- | jstests/replsets/step_down_during_draining3.js | 2 | ||||
-rw-r--r-- | src/mongo/shell/assert.js | 14 | ||||
-rw-r--r-- | src/mongo/shell/replsettest.js | 82 |
10 files changed, 87 insertions, 64 deletions
diff --git a/jstests/replsets/avg_num_catchup_ops.js b/jstests/replsets/avg_num_catchup_ops.js index 5a3bd488722..418f7770ac4 100644 --- a/jstests/replsets/avg_num_catchup_ops.js +++ b/jstests/replsets/avg_num_catchup_ops.js @@ -46,8 +46,7 @@ assert(testNodeServerStatus.electionMetrics.averageCatchUpOps, assert.eq(testNodeServerStatus.electionMetrics.averageCatchUpOps, 4); // Step up another node temporarily. -const tempPrimary = rst.stepUpNoAwaitReplication(rst.getSecondaries()[0]); -assert.eq(tempPrimary, rst.getPrimary()); +const tempPrimary = rst.stepUp(rst.getSecondaries()[0]); rst.awaitReplication(); // Step up the testNode and force it to catchup again. diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js index d565fa2b5d5..fd8f9b9a699 100644 --- a/jstests/replsets/catchup.js +++ b/jstests/replsets/catchup.js @@ -62,9 +62,7 @@ let initialNewPrimaryStatus = assert.commandWorked(rst.getSecondary().adminCommand({serverStatus: 1})); // Should complete transition to primary immediately. -var newPrimary = rst.stepUpNoAwaitReplication(rst.getSecondary()); -// Should win an election and finish the transition very quickly. -assert.eq(newPrimary, rst.getPrimary()); +var newPrimary = rst.stepUp(rst.getSecondary(), {awaitReplicationBeforeStepUp: false}); rst.awaitReplication(); // Check that the 'numCatchUps' field has not been incremented in serverStatus. diff --git a/jstests/replsets/change_stream_stepdown.js b/jstests/replsets/change_stream_stepdown.js index 68219931b6f..6cd1eaa2d60 100644 --- a/jstests/replsets/change_stream_stepdown.js +++ b/jstests/replsets/change_stream_stepdown.js @@ -18,11 +18,8 @@ replTest.initiateWithHighElectionTimeout(); function stepUp(replTest, conn) { assert.commandWorked(conn.adminCommand({replSetFreeze: 0})); - // Steps up the node in conn but this function does not wait for the new primary to be able to - // accept writes. - replTest.stepUpNoAwaitReplication(conn); - // Waits for the new primary to accept new writes. 
- return replTest.getPrimary(); + // Steps up the node in conn and awaits for the stepped up node to become writable primary. + return replTest.stepUp(conn, {awaitReplicationBeforeStepUp: false}); } const dbName = name; @@ -30,9 +27,7 @@ const collName = "change_stream_stepdown"; const changeStreamComment = collName + "_comment"; const primary = replTest.getPrimary(); -const secondary = replTest.getSecondary(); const primaryDb = primary.getDB(dbName); -const secondaryDb = secondary.getDB(dbName); const primaryColl = primaryDb[collName]; // Open a change stream. @@ -96,9 +91,10 @@ jsTestLog("Testing that changestream waiting on old primary sees docs inserted o replTest.awaitReplication(); // Ensure secondary is up to date and can win an election. -function shellFn(secondaryHost, dbName, collName, changeStreamComment, stepUpFn) { +function shellFn(dbName, collName, changeStreamComment, stepUpFn) { // Wait for the getMore to be in progress. - assert.soon(() => db.getSiblingDB("admin") + const primary = db.getMongo(); + assert.soon(() => primary.getDB("admin") .aggregate([ {'$currentOp': {}}, { @@ -110,19 +106,18 @@ function shellFn(secondaryHost, dbName, collName, changeStreamComment, stepUpFn) ]) .itcount() == 1); - const replTest = new ReplSetTest(secondaryHost); - const secondary = new Mongo(secondaryHost); - const secondaryDb = secondary.getDB(dbName); + const replTest = new ReplSetTest(primary.host); + // Step down the old primary and wait for new primary. 
- jsTestLog(`Stepping up ${secondaryHost} and waiting for new primary`); - stepUpFn(replTest, secondary); + const newPrimary = stepUpFn(replTest, replTest.getSecondary()); + const newPrimaryDB = newPrimary.getDB(dbName); + assert.neq(newPrimary, primary, "Primary didn't change."); jsTestLog("Inserting document on new primary"); - assert.commandWorked(secondaryDb[collName].insert({_id: 4}), {writeConcern: {w: "majority"}}); + assert.commandWorked(newPrimaryDB[collName].insert({_id: 4}), {writeConcern: {w: "majority"}}); } let waitForShell = startParallelShell( - funWithArgs(shellFn, secondary.host, dbName, collName, changeStreamComment, stepUp), - primary.port); + funWithArgs(shellFn, dbName, collName, changeStreamComment, stepUp), primary.port); res = assert.commandWorked(primaryDb.runCommand({ getMore: cursorId, diff --git a/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js b/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js index 4458bbbc4b1..6257f066eab 100644 --- a/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js +++ b/jstests/replsets/reconstruct_prepared_transactions_initial_sync.js @@ -172,9 +172,7 @@ jsTestLog("Stepping up the secondary"); // Step up the secondary after initial sync is done and make sure the other two transactions are // properly prepared. 
-replTest.stepUpNoAwaitReplication(secondary); -replTest.waitForState(secondary, ReplSetTest.State.PRIMARY); -const newPrimary = replTest.getPrimary(); +const newPrimary = replTest.stepUp(secondary, {awaitReplicationBeforeStepUp: false}); testDB = newPrimary.getDB(dbName); testColl = testDB.getCollection(collName); diff --git a/jstests/replsets/rslib.js b/jstests/replsets/rslib.js index 3b385e70e29..6f397b08ca4 100644 --- a/jstests/replsets/rslib.js +++ b/jstests/replsets/rslib.js @@ -500,7 +500,8 @@ stopReplicationAndEnforceNewPrimaryToCatchUp = function(rst, node) { const latestOpOnOldPrimary = getLatestOp(oldPrimary); // New primary wins immediately, but needs to catch up. - const newPrimary = rst.stepUpNoAwaitReplication(node); + const newPrimary = + rst.stepUp(node, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); const latestOpOnNewPrimary = getLatestOp(newPrimary); // Check this node is not writable. assert.eq(newPrimary.getDB("test").isMaster().ismaster, false); diff --git a/jstests/replsets/step_down_during_draining.js b/jstests/replsets/step_down_during_draining.js index 53009c66f36..dde02adff58 100644 --- a/jstests/replsets/step_down_during_draining.js +++ b/jstests/replsets/step_down_during_draining.js @@ -79,7 +79,7 @@ assert.soon( 1000); reconnect(secondary); -replSet.stepUpNoAwaitReplication(secondary); +replSet.stepUp(secondary, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); // Secondary doesn't allow writes yet. var res = secondary.getDB("admin").runCommand({"isMaster": 1}); @@ -95,10 +95,10 @@ assert.commandFailedWithCode( // Original primary steps up. 
reconnect(primary); -replSet.stepUpNoAwaitReplication(primary); +replSet.stepUp(primary, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); reconnect(secondary); -replSet.stepUpNoAwaitReplication(secondary); +replSet.stepUp(secondary, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); // Disable fail point to allow replication. secondaries.forEach(disableFailPoint); diff --git a/jstests/replsets/step_down_during_draining2.js b/jstests/replsets/step_down_during_draining2.js index 5b972cfb51a..366854350c1 100644 --- a/jstests/replsets/step_down_during_draining2.js +++ b/jstests/replsets/step_down_during_draining2.js @@ -81,7 +81,7 @@ assert.soon( 1000); reconnect(secondary); -replSet.stepUpNoAwaitReplication(secondary); +replSet.stepUp(secondary, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); // Secondary doesn't allow writes yet. var res = secondary.getDB("admin").runCommand({"isMaster": 1}); @@ -134,7 +134,7 @@ assert(!secondary.adminCommand('ismaster').ismaster); // Now ensure that the node can successfully become primary again. replSet.restart(0); replSet.restart(2); -replSet.stepUpNoAwaitReplication(secondary); +replSet.stepUp(secondary, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); assert.soon(function() { return secondary.adminCommand('ismaster').ismaster; diff --git a/jstests/replsets/step_down_during_draining3.js b/jstests/replsets/step_down_during_draining3.js index ac1f2239498..9cd965d48cd 100644 --- a/jstests/replsets/step_down_during_draining3.js +++ b/jstests/replsets/step_down_during_draining3.js @@ -71,7 +71,7 @@ assert.soon( 1000); reconnect(secondary); -replSet.stepUpNoAwaitReplication(secondary); +replSet.stepUp(secondary, {awaitReplicationBeforeStepUp: false, awaitWritablePrimary: false}); // Secondary doesn't allow writes yet. 
var res = secondary.getDB("admin").runCommand({"isMaster": 1}); diff --git a/src/mongo/shell/assert.js b/src/mongo/shell/assert.js index 20c8f2d481b..5c06110b4ad 100644 --- a/src/mongo/shell/assert.js +++ b/src/mongo/shell/assert.js @@ -351,7 +351,19 @@ assert = (function() { assert.soonNoExcept = function(func, msg, timeout, interval) { var safeFunc = _convertExceptionToReturnStatus(func, "assert.soonNoExcept caught exception"); - assert.soon(safeFunc, msg, timeout, interval); + var safeFuncwithMinimizedNoise = () => { + // Turns off printing the JavaScript stacktrace in doassert() to avoid generating an + // overwhelming amount of log messages when handling transient errors. + const origTraceExceptions = TestData.traceExceptions; + TestData.traceExceptions = false; + + const res = safeFunc(); + + // Restore it's value to original value. + TestData.traceExceptions = origTraceExceptions; + return res; + }; + assert.soon(safeFuncwithMinimizedNoise, msg, timeout, interval); }; /* diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index 1e5bf078159..481c1f60339 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -853,7 +853,7 @@ var ReplSetTest = function(opts) { print("AwaitNodesAgreeOnPrimary: Nodes agreed on primary " + nodes[primary].name); return true; - }, "Awaiting nodes to agree on primary", timeout); + }, "Awaiting nodes to agree on primary timed out", timeout); }; /** @@ -1204,40 +1204,56 @@ var ReplSetTest = function(opts) { }; /** - * Steps up 'node' as primary. - * Waits for all nodes to reach the same optime before sending the replSetStepUp command - * to 'node'. + * Steps up 'node' as primary and by default it waits for the stepped up node to become a + * writable primary and waits for all nodes to reach the same optime before sending the + * replSetStepUp command to 'node'. + * * Calls awaitReplication() which requires all connections in 'nodes' to be authenticated. 
+ * This stepUp() assumes that there is no network partition in the replica set. */ - this.stepUp = function(node) { - this.awaitReplication(); - this.awaitNodesAgreeOnAppliedOpTime(); - this.awaitNodesAgreeOnPrimary(); - if (this.getPrimary() === node) { - return; - } + this.stepUp = function(node, { + awaitReplicationBeforeStepUp: awaitReplicationBeforeStepUp = true, + awaitWritablePrimary: awaitWritablePrimary = true + } = {}) { + jsTest.log("ReplSetTest stepUp: Stepping up " + node.host); + + if (awaitReplicationBeforeStepUp) { + this.awaitReplication(); + } + + assert.soonNoExcept(() => { + const res = node.adminCommand({replSetStepUp: 1}); + // This error is possible if we are running mongoDB binary < 3.4 as + // part of multi-version upgrade test. So, for those older branches, + // simply wait for the requested node to get elected as primary due + // to election timeout. + if (!res.ok && res.code === ErrorCodes.CommandNotFound) { + jsTest.log( + 'replSetStepUp command not supported on node ' + node.host + + " ; so wait for the requested node to get elected due to election timeout."); + if (this.getPrimary() === node) { + return true; + } + } + assert.commandWorked(res); - jsTest.log("Stepping up: " + node.host + " in stepUp"); - assert.commandWorked(node.adminCommand({replSetStepUp: 1})); - this.awaitNodesAgreeOnPrimary(); - assert.eq(this.getPrimary(), node, 'failed to step up node ' + node.host + ' in stepUp'); - }; + // Since assert.soon() timeout is 10 minutes (default), setting + // awaitNodesAgreeOnPrimary() timeout as 1 minute to allow retry of replSetStepUp + // command on failure of the replica set to agree on the primary. + const timeout = 60 * 1000; + this.awaitNodesAgreeOnPrimary(timeout, this.nodes, this.getNodeId(node)); - /** - * Steps up 'node' as primary. 
- */ - this.stepUpNoAwaitReplication = function(node) { - jsTest.log("Stepping up: " + node.host + " in stepUpNoAwaitReplication"); - assert.soonNoExcept( - function() { - assert.commandWorked(node.adminCommand({replSetStepUp: 1})); - self.awaitNodesAgreeOnPrimary( - self.kDefaultTimeoutMS, self.nodes, self.getNodeId(node)); - return node.adminCommand('replSetGetStatus').myState === ReplSetTest.State.PRIMARY; - }, - 'failed to step up node ' + node.host + ' in stepUpNoAwaitReplication', - self.kDefaultTimeoutMS); + // getPrimary() guarantees that there will be only one writable primary for a replica + // set. + if (!awaitWritablePrimary || this.getPrimary() === node) { + return true; + } + + jsTest.log(node.host + ' is not primary after stepUp command'); + return false; + }, "Timed out while waiting for stepUp to succeed on node in port: " + node.port); + jsTest.log("ReplSetTest stepUp: Finished stepping up " + node.host); return node; }; @@ -2844,7 +2860,11 @@ var ReplSetTest = function(opts) { var existingNodes = conf.members.map(member => member.host); self.ports = existingNodes.map(node => node.split(':')[1]); - self.nodes = existingNodes.map(node => new Mongo(node)); + self.nodes = existingNodes.map(node => { + let conn = new Mongo(node); + conn.name = conn.host; + return conn; + }); self.waitForKeys = false; self.host = existingNodes[0].split(':')[0]; self.name = conf._id; |