diff options
author | A. Jesse Jiryu Davis <jesse@mongodb.com> | 2019-09-26 12:42:41 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-09-26 12:42:41 +0000 |
commit | fc0db84002fd553bcf0f7d7a153d79bbba75cf34 (patch) | |
tree | 4d0c0d7ec92d46e82a8b648742579cde49dd076e | |
parent | c5540ac7d3ceb7c6ce8ff5a3354d80f3eb09dbf1 (diff) | |
download | mongo-fc0db84002fd553bcf0f7d7a153d79bbba75cf34.tar.gz |
Revert "SERVER-43226 ReplSetTest.stop can hang due to fsyncLock"
This reverts commit 7a9cefdabf985360ada4ac7569b0682320075edf.
-rw-r--r-- | jstests/replsets/libs/rollback_test.js | 28 | ||||
-rw-r--r-- | src/mongo/shell/replsettest.js | 192 |
2 files changed, 93 insertions, 127 deletions
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js index 2cf97d25a2d..0f8e1647608 100644 --- a/jstests/replsets/libs/rollback_test.js +++ b/jstests/replsets/libs/rollback_test.js @@ -61,6 +61,7 @@ function RollbackTest(name = "RollbackTest", replSet) { const SIGTERM = 15; const kNumDataBearingNodes = 3; const kElectableNodes = 2; + const kForeverSecs = 24 * 60 * 60; let rst; let curPrimary; @@ -376,7 +377,7 @@ function RollbackTest(name = "RollbackTest", replSet) { // catch up to curPrimary's oplog, then the rollback node can become the new primary. // If so, it can lead to unplanned state transitions, like unconditional step down, during // the test. To avoid those problems, prevent rollback node from starting an election. - assert.commandWorked(curSecondary.adminCommand({replSetFreeze: ReplSetTest.kForeverSecs})); + assert.commandWorked(curSecondary.adminCommand({replSetFreeze: kForeverSecs})); log(`Reconnecting the secondary ${curSecondary.host} so it'll go into rollback`); // Reconnect the rollback node to the current primary, which is the node we want to sync @@ -443,7 +444,30 @@ function RollbackTest(name = "RollbackTest", replSet) { // Freeze the node if the restarted node is the rollback node. if (curState === State.kSyncSourceOpsDuringRollback && rst.getNodeId(curSecondary) === nodeId) { - rst.freeze(nodeId); + assert.soon(() => { + try { + // Try stepping down the rollback node if it became the primary after its + // restart, as it might have caught up with the original primary and facing + // arbitrary machine/network slowness. + curSecondary.adminCommand({"replSetStepDown": kForeverSecs, "force": true}); + // Prevent rollback node from running election. There is a chance that this + // node might have started running election or became primary after + // 'replSetStepDown' cmd, so 'replSetFreeze' cmd can fail. + assert.commandWorked( + curSecondary.adminCommand({"replSetFreeze": kForeverSecs})); + return true; + } catch (e) { + // Network error can happen if the node simultaneously tries to transition to + // ROLLBACK state. + if (isNetworkError(e) || e.code === ErrorCodes.NotSecondary || + e.code === ErrorCodes.NotYetInitialized) { + log('Failed to freeze the node.' + tojson(e)); + return false; + } + + throw e; + } + }, `Failed to run replSetFreeze cmd on ${curSecondary.host}`); } const oldPrimary = curPrimary; diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index 3fb7f3fee37..cf2cc3a80c1 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -195,55 +195,6 @@ var ReplSetTest = function(opts) { } /** - * Wrap a function so it can accept a node id or connection as its first argument. The argument - * is converted to a connection. - */ - function _nodeParamToConn(wrapped) { - return function(node, ...wrappedArgs) { - if (node.getDB) { - return wrapped.call(this, node, ...wrappedArgs); - } - - assert(self.nodes.hasOwnProperty(node), `${node} not found in self.nodes`); - return wrapped.call(this, self.nodes[node], ...wrappedArgs); - }; - } - - /** - * Wrap a function so it can accept a node id or connection as its first argument. The argument - * is converted to a node id. - */ - function _nodeParamToId(wrapped) { - return function(node, ...wrappedArgs) { - if (node.getDB) { - return wrapped.call(this, self.getNodeId(node), ...wrappedArgs); - } - - assert(Number.isInteger(node), `node must be an integer, not ${node}`); - return wrapped.call(this, node, ...wrappedArgs); - }; - } - - /** - * Wrap a function so it accepts a single node or list of them as its first argument. The - * function is called once per node provided. - */ - function _nodeParamToSingleNode(wrapped) { - return function(node, ...wrappedArgs) { - if (node.hasOwnProperty('length')) { - let returnValueList = []; - for (let i = 0; i < node.length; i++) { - returnValueList.push(wrapped.call(this, node[i], ...wrappedArgs)); - } - - return returnValueList; - } - - return wrapped.call(this, node, ...wrappedArgs); - }; - } - - /** * Wait for a rs indicator to go to a particular state or states. * * @param node is a single node or list of nodes, by id or conn @@ -252,13 +203,28 @@ var ReplSetTest = function(opts) { * @param timeout how long to wait for the state to be reached * @param reconnectNode indicates that we should reconnect to a node that stepped down */ - const _waitForIndicator = _nodeParamToSingleNode(_nodeParamToConn(function( - node, states, ind, timeout, reconnectNode) { + function _waitForIndicator(node, states, ind, timeout, reconnectNode) { timeout = timeout || self.kDefaultTimeoutMS; if (reconnectNode === undefined) { reconnectNode = true; } + if (node.length) { + var nodes = node; + for (var i = 0; i < nodes.length; i++) { + if (states.length) + _waitForIndicator(nodes[i], states[i], ind, timeout, reconnectNode); + else + _waitForIndicator(nodes[i], states, ind, timeout, reconnectNode); + } + + return; + } + + if (!node.getDB) { + node = self.nodes[node]; + } + if (!states.length) { states = [states]; } @@ -352,7 +318,7 @@ var ReplSetTest = function(opts) { print("ReplSetTest waitForIndicator final status:"); printjson(status); - })); + } /** * Wait for a health indicator to go to a particular state or states. @@ -1838,54 +1804,32 @@ var ReplSetTest = function(opts) { }); } - // Prevent an election, which could start, then hang due to the fsyncLock. - // First copy _slaves. - const frozenNodes = [...self._slaves]; - self.freeze(frozenNodes); - - // Await primary in case freeze() had to step down a node that was unexpectedly primary. - self.getPrimary(); + var activeException = false; - // Lock the primary to prevent writes in the background while we are getting the - // dbhashes of the replica set members. It's not important if the storage engine fails - // to perform its fsync operation. The only requirement is that writes are locked out. + // Lock the primary to prevent the TTL monitor from deleting expired documents in + // the background while we are getting the dbhashes of the replica set members. It's not + // important if the storage engine fails to perform its fsync operation. The only + // requirement is that writes are locked out. assert.commandWorked(primary.adminCommand({fsync: 1, lock: 1, allowFsyncFailure: true}), 'failed to lock the primary'); - - function postApplyCheckerFunction() { - // Unfreeze secondaries and unlock primary. - try { - assert.commandWorked(primary.adminCommand({fsyncUnlock: 1})); - } catch (e) { - print(`Continuing after fsyncUnlock error: ${e}`); - } - - frozenNodes.forEach(secondary => { - try { - assert.commandWorked(secondary.adminCommand({replSetFreeze: 0})); - } catch (e) { - print(`Continuing after replSetFreeze error: ${e}`); - } - }); - } - - let activeException = false; try { - self.awaitReplication(null, null, slaves); + this.awaitReplication(null, null, slaves); checkerFunction.apply(this, checkerFunctionArgs); } catch (e) { activeException = true; throw e; } finally { - if (activeException) { - try { - postApplyCheckerFunction(); - } catch (e) { - // Print the postApplyCheckerFunction error, propagate the original. - print(e); + // Allow writes on the primary. + var res = primary.adminCommand({fsyncUnlock: 1}); + + if (!res.ok) { + var msg = 'failed to unlock the primary, which may cause this' + + ' test to hang: ' + tojson(res); + if (activeException) { + print(msg); + } else { + throw new Error(msg); } - } else { - postApplyCheckerFunction(); } } }; @@ -2316,7 +2260,23 @@ var ReplSetTest = function(opts) { * before the server starts. Default: false. * */ - this.start = _nodeParamToSingleNode(_nodeParamToId(function(n, options, restart, wait) { + this.start = function(n, options, restart, wait) { + if (n.length) { + var nodes = n; + var started = []; + + for (var i = 0; i < nodes.length; i++) { + if (this.start(nodes[i], Object.merge({}, options), restart, wait)) { + started.push(nodes[i]); + } + } + + return started; + } + + // TODO: should we do something special if we don't currently know about this node? + n = this.getNodeId(n); + print("ReplSetTest n is : " + n); var defaults = { @@ -2445,7 +2405,7 @@ var ReplSetTest = function(opts) { } return this.nodes[n]; - })); + }; /** * Restarts a db without clearing the data directory by default, and using the node(s)'s @@ -2484,31 +2444,6 @@ var ReplSetTest = function(opts) { return started; }; - /** - * Step down and freeze a particular node or nodes. - * - * @param node is a single node or list of nodes, by id or conn - */ - this.freeze = _nodeParamToSingleNode(_nodeParamToConn(function(node) { - assert.soon(() => { - try { - // Ensure node is not primary. Ignore errors, probably means it's already secondary. - node.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true}); - // Prevent node from running election. Fails if it already started an election. - assert.commandWorked(node.adminCommand({replSetFreeze: ReplSetTest.kForeverSecs})); - return true; - } catch (e) { - if (isNetworkError(e) || e.code === ErrorCodes.NotSecondary || - e.code === ErrorCodes.NotYetInitialized) { - jsTestLog(`Failed to freeze node ${node.host}: ${e}`); - return false; - } - - throw e; - } - }, `Failed to run replSetFreeze cmd on ${node.host}`); - })); - this.stopMaster = function(signal, opts) { var master = this.getPrimary(); var master_id = this.getNodeId(master); @@ -2529,8 +2464,20 @@ var ReplSetTest = function(opts) { * @param {boolean} [extraOptions.forRestart=false] indicates whether stop() is being called * with the intent to call start() with restart=true for the same node(s) n. */ - this.stop = _nodeParamToSingleNode(_nodeParamToConn(function( - n, signal, opts, {forRestart: forRestart = false} = {}) { + this.stop = function(n, signal, opts, {forRestart: forRestart = false} = {}) { + // Flatten array of nodes to stop + if (n.length) { + var nodes = n; + + var stopped = []; + for (var i = 0; i < nodes.length; i++) { + if (this.stop(nodes[i], signal, opts)) + stopped.push(nodes[i]); + } + + return stopped; + } + // Can specify wait as second parameter, if using default signal if (signal == true || signal == false) { signal = undefined; @@ -2555,7 +2502,7 @@ var ReplSetTest = function(opts) { } return ret; - })); + }; /** * Kill all members of this replica set. @@ -2802,11 +2749,6 @@ var ReplSetTest = function(opts) { ReplSetTest.kDefaultTimeoutMS = 10 * 60 * 1000; /** - * Global default number that's effectively infinite. - */ -ReplSetTest.kForeverSecs = 24 * 60 * 60; - -/** * Set of states that the replica set can be in. Used for the wait functions. */ ReplSetTest.State = { |