summaryrefslogtreecommitdiff
path: root/src/mongo/shell
diff options
context:
space:
mode:
authorA. Jesse Jiryu Davis <jesse@mongodb.com>2019-09-26 00:39:44 +0000
committerevergreen <evergreen@mongodb.com>2019-09-26 00:39:44 +0000
commit7a9cefdabf985360ada4ac7569b0682320075edf (patch)
tree4536876eb6bf4fea63e079c3b503890360f2d2dc /src/mongo/shell
parent58619f8c5afffe129d1b2eebbbf0770efa0b331f (diff)
downloadmongo-7a9cefdabf985360ada4ac7569b0682320075edf.tar.gz
SERVER-43226 ReplSetTest.stop can hang due to fsyncLock
Diffstat (limited to 'src/mongo/shell')
-rw-r--r--src/mongo/shell/replsettest.js192
1 files changed, 125 insertions, 67 deletions
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index cf2cc3a80c1..3fb7f3fee37 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -195,6 +195,55 @@ var ReplSetTest = function(opts) {
}
/**
+ * Wrap a function so it can accept a node id or connection as its first argument. The argument
+ * is converted to a connection.
+ */
+ function _nodeParamToConn(wrapped) {
+ return function(node, ...wrappedArgs) {
+ if (node.getDB) {
+ return wrapped.call(this, node, ...wrappedArgs);
+ }
+
+ assert(self.nodes.hasOwnProperty(node), `${node} not found in self.nodes`);
+ return wrapped.call(this, self.nodes[node], ...wrappedArgs);
+ };
+ }
+
+ /**
+ * Wrap a function so it can accept a node id or connection as its first argument. The argument
+ * is converted to a node id.
+ */
+ function _nodeParamToId(wrapped) {
+ return function(node, ...wrappedArgs) {
+ if (node.getDB) {
+ return wrapped.call(this, self.getNodeId(node), ...wrappedArgs);
+ }
+
+ assert(Number.isInteger(node), `node must be an integer, not ${node}`);
+ return wrapped.call(this, node, ...wrappedArgs);
+ };
+ }
+
+ /**
+ * Wrap a function so it accepts a single node or list of them as its first argument. The
+ * function is called once per node provided.
+ */
+ function _nodeParamToSingleNode(wrapped) {
+ return function(node, ...wrappedArgs) {
+ if (node.hasOwnProperty('length')) {
+ let returnValueList = [];
+ for (let i = 0; i < node.length; i++) {
+ returnValueList.push(wrapped.call(this, node[i], ...wrappedArgs));
+ }
+
+ return returnValueList;
+ }
+
+ return wrapped.call(this, node, ...wrappedArgs);
+ };
+ }
+
+ /**
* Wait for a rs indicator to go to a particular state or states.
*
* @param node is a single node or list of nodes, by id or conn
@@ -203,28 +252,13 @@ var ReplSetTest = function(opts) {
* @param timeout how long to wait for the state to be reached
* @param reconnectNode indicates that we should reconnect to a node that stepped down
*/
- function _waitForIndicator(node, states, ind, timeout, reconnectNode) {
+ const _waitForIndicator = _nodeParamToSingleNode(_nodeParamToConn(function(
+ node, states, ind, timeout, reconnectNode) {
timeout = timeout || self.kDefaultTimeoutMS;
if (reconnectNode === undefined) {
reconnectNode = true;
}
- if (node.length) {
- var nodes = node;
- for (var i = 0; i < nodes.length; i++) {
- if (states.length)
- _waitForIndicator(nodes[i], states[i], ind, timeout, reconnectNode);
- else
- _waitForIndicator(nodes[i], states, ind, timeout, reconnectNode);
- }
-
- return;
- }
-
- if (!node.getDB) {
- node = self.nodes[node];
- }
-
if (!states.length) {
states = [states];
}
@@ -318,7 +352,7 @@ var ReplSetTest = function(opts) {
print("ReplSetTest waitForIndicator final status:");
printjson(status);
- }
+ }));
/**
* Wait for a health indicator to go to a particular state or states.
@@ -1804,32 +1838,54 @@ var ReplSetTest = function(opts) {
});
}
- var activeException = false;
+ // Prevent an election, which could start, then hang due to the fsyncLock.
+ // First copy _slaves.
+ const frozenNodes = [...self._slaves];
+ self.freeze(frozenNodes);
+
+ // Await primary in case freeze() had to step down a node that was unexpectedly primary.
+ self.getPrimary();
- // Lock the primary to prevent the TTL monitor from deleting expired documents in
- // the background while we are getting the dbhashes of the replica set members. It's not
- // important if the storage engine fails to perform its fsync operation. The only
- // requirement is that writes are locked out.
+ // Lock the primary to prevent writes in the background while we are getting the
+ // dbhashes of the replica set members. It's not important if the storage engine fails
+ // to perform its fsync operation. The only requirement is that writes are locked out.
assert.commandWorked(primary.adminCommand({fsync: 1, lock: 1, allowFsyncFailure: true}),
'failed to lock the primary');
+
+ function postApplyCheckerFunction() {
+ // Unfreeze secondaries and unlock primary.
+ try {
+ assert.commandWorked(primary.adminCommand({fsyncUnlock: 1}));
+ } catch (e) {
+ print(`Continuing after fsyncUnlock error: ${e}`);
+ }
+
+ frozenNodes.forEach(secondary => {
+ try {
+ assert.commandWorked(secondary.adminCommand({replSetFreeze: 0}));
+ } catch (e) {
+ print(`Continuing after replSetFreeze error: ${e}`);
+ }
+ });
+ }
+
+ let activeException = false;
try {
- this.awaitReplication(null, null, slaves);
+ self.awaitReplication(null, null, slaves);
checkerFunction.apply(this, checkerFunctionArgs);
} catch (e) {
activeException = true;
throw e;
} finally {
- // Allow writes on the primary.
- var res = primary.adminCommand({fsyncUnlock: 1});
-
- if (!res.ok) {
- var msg = 'failed to unlock the primary, which may cause this' +
- ' test to hang: ' + tojson(res);
- if (activeException) {
- print(msg);
- } else {
- throw new Error(msg);
+ if (activeException) {
+ try {
+ postApplyCheckerFunction();
+ } catch (e) {
+ // Print the postApplyCheckerFunction error, propagate the original.
+ print(e);
}
+ } else {
+ postApplyCheckerFunction();
}
}
};
@@ -2260,23 +2316,7 @@ var ReplSetTest = function(opts) {
* before the server starts. Default: false.
*
*/
- this.start = function(n, options, restart, wait) {
- if (n.length) {
- var nodes = n;
- var started = [];
-
- for (var i = 0; i < nodes.length; i++) {
- if (this.start(nodes[i], Object.merge({}, options), restart, wait)) {
- started.push(nodes[i]);
- }
- }
-
- return started;
- }
-
- // TODO: should we do something special if we don't currently know about this node?
- n = this.getNodeId(n);
-
+ this.start = _nodeParamToSingleNode(_nodeParamToId(function(n, options, restart, wait) {
print("ReplSetTest n is : " + n);
var defaults = {
@@ -2405,7 +2445,7 @@ var ReplSetTest = function(opts) {
}
return this.nodes[n];
- };
+ }));
/**
* Restarts a db without clearing the data directory by default, and using the node(s)'s
@@ -2444,6 +2484,31 @@ var ReplSetTest = function(opts) {
return started;
};
+ /**
+ * Step down and freeze a particular node or nodes.
+ *
+ * @param node is a single node or list of nodes, by id or conn
+ */
+ this.freeze = _nodeParamToSingleNode(_nodeParamToConn(function(node) {
+ assert.soon(() => {
+ try {
+ // Ensure node is not primary. Ignore errors, probably means it's already secondary.
+ node.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true});
+ // Prevent node from running election. Fails if it already started an election.
+ assert.commandWorked(node.adminCommand({replSetFreeze: ReplSetTest.kForeverSecs}));
+ return true;
+ } catch (e) {
+ if (isNetworkError(e) || e.code === ErrorCodes.NotSecondary ||
+ e.code === ErrorCodes.NotYetInitialized) {
+ jsTestLog(`Failed to freeze node ${node.host}: ${e}`);
+ return false;
+ }
+
+ throw e;
+ }
+ }, `Failed to run replSetFreeze cmd on ${node.host}`);
+ }));
+
this.stopMaster = function(signal, opts) {
var master = this.getPrimary();
var master_id = this.getNodeId(master);
@@ -2464,20 +2529,8 @@ var ReplSetTest = function(opts) {
* @param {boolean} [extraOptions.forRestart=false] indicates whether stop() is being called
* with the intent to call start() with restart=true for the same node(s) n.
*/
- this.stop = function(n, signal, opts, {forRestart: forRestart = false} = {}) {
- // Flatten array of nodes to stop
- if (n.length) {
- var nodes = n;
-
- var stopped = [];
- for (var i = 0; i < nodes.length; i++) {
- if (this.stop(nodes[i], signal, opts))
- stopped.push(nodes[i]);
- }
-
- return stopped;
- }
-
+ this.stop = _nodeParamToSingleNode(_nodeParamToConn(function(
+ n, signal, opts, {forRestart: forRestart = false} = {}) {
// Can specify wait as second parameter, if using default signal
if (signal == true || signal == false) {
signal = undefined;
@@ -2502,7 +2555,7 @@ var ReplSetTest = function(opts) {
}
return ret;
- };
+ }));
/**
* Kill all members of this replica set.
@@ -2749,6 +2802,11 @@ var ReplSetTest = function(opts) {
ReplSetTest.kDefaultTimeoutMS = 10 * 60 * 1000;
/**
+ * Global default number that's effectively infinite.
+ */
+ReplSetTest.kForeverSecs = 24 * 60 * 60;
+
+/**
* Set of states that the replica set can be in. Used for the wait functions.
*/
ReplSetTest.State = {