summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorA. Jesse Jiryu Davis <jesse@mongodb.com>2019-09-26 12:42:41 +0000
committerevergreen <evergreen@mongodb.com>2019-09-26 12:42:41 +0000
commitfc0db84002fd553bcf0f7d7a153d79bbba75cf34 (patch)
tree4d0c0d7ec92d46e82a8b648742579cde49dd076e
parentc5540ac7d3ceb7c6ce8ff5a3354d80f3eb09dbf1 (diff)
downloadmongo-fc0db84002fd553bcf0f7d7a153d79bbba75cf34.tar.gz
Revert "SERVER-43226 ReplSetTest.stop can hang due to fsyncLock"
This reverts commit 7a9cefdabf985360ada4ac7569b0682320075edf.
-rw-r--r--jstests/replsets/libs/rollback_test.js28
-rw-r--r--src/mongo/shell/replsettest.js192
2 files changed, 93 insertions, 127 deletions
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index 2cf97d25a2d..0f8e1647608 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -61,6 +61,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
const SIGTERM = 15;
const kNumDataBearingNodes = 3;
const kElectableNodes = 2;
+ const kForeverSecs = 24 * 60 * 60;
let rst;
let curPrimary;
@@ -376,7 +377,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
// catch up to curPrimary's oplog, then the rollback node can become the new primary.
// If so, it can lead to unplanned state transitions, like unconditional step down, during
// the test. To avoid those problems, prevent rollback node from starting an election.
- assert.commandWorked(curSecondary.adminCommand({replSetFreeze: ReplSetTest.kForeverSecs}));
+ assert.commandWorked(curSecondary.adminCommand({replSetFreeze: kForeverSecs}));
log(`Reconnecting the secondary ${curSecondary.host} so it'll go into rollback`);
// Reconnect the rollback node to the current primary, which is the node we want to sync
@@ -443,7 +444,30 @@ function RollbackTest(name = "RollbackTest", replSet) {
// Freeze the node if the restarted node is the rollback node.
if (curState === State.kSyncSourceOpsDuringRollback &&
rst.getNodeId(curSecondary) === nodeId) {
- rst.freeze(nodeId);
+ assert.soon(() => {
+ try {
+ // Try stepping down the rollback node if it became the primary after its
+ // restart, as it might have caught up with the original primary and facing
+ // arbitrary machine/network slowness.
+ curSecondary.adminCommand({"replSetStepDown": kForeverSecs, "force": true});
+ // Prevent rollback node from running election. There is a chance that this
+ // node might have started running election or became primary after
+ // 'replSetStepDown' cmd, so 'replSetFreeze' cmd can fail.
+ assert.commandWorked(
+ curSecondary.adminCommand({"replSetFreeze": kForeverSecs}));
+ return true;
+ } catch (e) {
+ // Network error can happen if the node simultaneously tries to transition to
+ // ROLLBACK state.
+ if (isNetworkError(e) || e.code === ErrorCodes.NotSecondary ||
+ e.code === ErrorCodes.NotYetInitialized) {
+ log('Failed to freeze the node.' + tojson(e));
+ return false;
+ }
+
+ throw e;
+ }
+ }, `Failed to run replSetFreeze cmd on ${curSecondary.host}`);
}
const oldPrimary = curPrimary;
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index 3fb7f3fee37..cf2cc3a80c1 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -195,55 +195,6 @@ var ReplSetTest = function(opts) {
}
/**
- * Wrap a function so it can accept a node id or connection as its first argument. The argument
- * is converted to a connection.
- */
- function _nodeParamToConn(wrapped) {
- return function(node, ...wrappedArgs) {
- if (node.getDB) {
- return wrapped.call(this, node, ...wrappedArgs);
- }
-
- assert(self.nodes.hasOwnProperty(node), `${node} not found in self.nodes`);
- return wrapped.call(this, self.nodes[node], ...wrappedArgs);
- };
- }
-
- /**
- * Wrap a function so it can accept a node id or connection as its first argument. The argument
- * is converted to a node id.
- */
- function _nodeParamToId(wrapped) {
- return function(node, ...wrappedArgs) {
- if (node.getDB) {
- return wrapped.call(this, self.getNodeId(node), ...wrappedArgs);
- }
-
- assert(Number.isInteger(node), `node must be an integer, not ${node}`);
- return wrapped.call(this, node, ...wrappedArgs);
- };
- }
-
- /**
- * Wrap a function so it accepts a single node or list of them as its first argument. The
- * function is called once per node provided.
- */
- function _nodeParamToSingleNode(wrapped) {
- return function(node, ...wrappedArgs) {
- if (node.hasOwnProperty('length')) {
- let returnValueList = [];
- for (let i = 0; i < node.length; i++) {
- returnValueList.push(wrapped.call(this, node[i], ...wrappedArgs));
- }
-
- return returnValueList;
- }
-
- return wrapped.call(this, node, ...wrappedArgs);
- };
- }
-
- /**
* Wait for a rs indicator to go to a particular state or states.
*
* @param node is a single node or list of nodes, by id or conn
@@ -252,13 +203,28 @@ var ReplSetTest = function(opts) {
* @param timeout how long to wait for the state to be reached
* @param reconnectNode indicates that we should reconnect to a node that stepped down
*/
- const _waitForIndicator = _nodeParamToSingleNode(_nodeParamToConn(function(
- node, states, ind, timeout, reconnectNode) {
+ function _waitForIndicator(node, states, ind, timeout, reconnectNode) {
timeout = timeout || self.kDefaultTimeoutMS;
if (reconnectNode === undefined) {
reconnectNode = true;
}
+ if (node.length) {
+ var nodes = node;
+ for (var i = 0; i < nodes.length; i++) {
+ if (states.length)
+ _waitForIndicator(nodes[i], states[i], ind, timeout, reconnectNode);
+ else
+ _waitForIndicator(nodes[i], states, ind, timeout, reconnectNode);
+ }
+
+ return;
+ }
+
+ if (!node.getDB) {
+ node = self.nodes[node];
+ }
+
if (!states.length) {
states = [states];
}
@@ -352,7 +318,7 @@ var ReplSetTest = function(opts) {
print("ReplSetTest waitForIndicator final status:");
printjson(status);
- }));
+ }
/**
* Wait for a health indicator to go to a particular state or states.
@@ -1838,54 +1804,32 @@ var ReplSetTest = function(opts) {
});
}
- // Prevent an election, which could start, then hang due to the fsyncLock.
- // First copy _slaves.
- const frozenNodes = [...self._slaves];
- self.freeze(frozenNodes);
-
- // Await primary in case freeze() had to step down a node that was unexpectedly primary.
- self.getPrimary();
+ var activeException = false;
- // Lock the primary to prevent writes in the background while we are getting the
- // dbhashes of the replica set members. It's not important if the storage engine fails
- // to perform its fsync operation. The only requirement is that writes are locked out.
+ // Lock the primary to prevent the TTL monitor from deleting expired documents in
+ // the background while we are getting the dbhashes of the replica set members. It's not
+ // important if the storage engine fails to perform its fsync operation. The only
+ // requirement is that writes are locked out.
assert.commandWorked(primary.adminCommand({fsync: 1, lock: 1, allowFsyncFailure: true}),
'failed to lock the primary');
-
- function postApplyCheckerFunction() {
- // Unfreeze secondaries and unlock primary.
- try {
- assert.commandWorked(primary.adminCommand({fsyncUnlock: 1}));
- } catch (e) {
- print(`Continuing after fsyncUnlock error: ${e}`);
- }
-
- frozenNodes.forEach(secondary => {
- try {
- assert.commandWorked(secondary.adminCommand({replSetFreeze: 0}));
- } catch (e) {
- print(`Continuing after replSetFreeze error: ${e}`);
- }
- });
- }
-
- let activeException = false;
try {
- self.awaitReplication(null, null, slaves);
+ this.awaitReplication(null, null, slaves);
checkerFunction.apply(this, checkerFunctionArgs);
} catch (e) {
activeException = true;
throw e;
} finally {
- if (activeException) {
- try {
- postApplyCheckerFunction();
- } catch (e) {
- // Print the postApplyCheckerFunction error, propagate the original.
- print(e);
+ // Allow writes on the primary.
+ var res = primary.adminCommand({fsyncUnlock: 1});
+
+ if (!res.ok) {
+ var msg = 'failed to unlock the primary, which may cause this' +
+ ' test to hang: ' + tojson(res);
+ if (activeException) {
+ print(msg);
+ } else {
+ throw new Error(msg);
}
- } else {
- postApplyCheckerFunction();
}
}
};
@@ -2316,7 +2260,23 @@ var ReplSetTest = function(opts) {
* before the server starts. Default: false.
*
*/
- this.start = _nodeParamToSingleNode(_nodeParamToId(function(n, options, restart, wait) {
+ this.start = function(n, options, restart, wait) {
+ if (n.length) {
+ var nodes = n;
+ var started = [];
+
+ for (var i = 0; i < nodes.length; i++) {
+ if (this.start(nodes[i], Object.merge({}, options), restart, wait)) {
+ started.push(nodes[i]);
+ }
+ }
+
+ return started;
+ }
+
+ // TODO: should we do something special if we don't currently know about this node?
+ n = this.getNodeId(n);
+
print("ReplSetTest n is : " + n);
var defaults = {
@@ -2445,7 +2405,7 @@ var ReplSetTest = function(opts) {
}
return this.nodes[n];
- }));
+ };
/**
* Restarts a db without clearing the data directory by default, and using the node(s)'s
@@ -2484,31 +2444,6 @@ var ReplSetTest = function(opts) {
return started;
};
- /**
- * Step down and freeze a particular node or nodes.
- *
- * @param node is a single node or list of nodes, by id or conn
- */
- this.freeze = _nodeParamToSingleNode(_nodeParamToConn(function(node) {
- assert.soon(() => {
- try {
- // Ensure node is not primary. Ignore errors, probably means it's already secondary.
- node.adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true});
- // Prevent node from running election. Fails if it already started an election.
- assert.commandWorked(node.adminCommand({replSetFreeze: ReplSetTest.kForeverSecs}));
- return true;
- } catch (e) {
- if (isNetworkError(e) || e.code === ErrorCodes.NotSecondary ||
- e.code === ErrorCodes.NotYetInitialized) {
- jsTestLog(`Failed to freeze node ${node.host}: ${e}`);
- return false;
- }
-
- throw e;
- }
- }, `Failed to run replSetFreeze cmd on ${node.host}`);
- }));
-
this.stopMaster = function(signal, opts) {
var master = this.getPrimary();
var master_id = this.getNodeId(master);
@@ -2529,8 +2464,20 @@ var ReplSetTest = function(opts) {
* @param {boolean} [extraOptions.forRestart=false] indicates whether stop() is being called
* with the intent to call start() with restart=true for the same node(s) n.
*/
- this.stop = _nodeParamToSingleNode(_nodeParamToConn(function(
- n, signal, opts, {forRestart: forRestart = false} = {}) {
+ this.stop = function(n, signal, opts, {forRestart: forRestart = false} = {}) {
+ // Flatten array of nodes to stop
+ if (n.length) {
+ var nodes = n;
+
+ var stopped = [];
+ for (var i = 0; i < nodes.length; i++) {
+ if (this.stop(nodes[i], signal, opts))
+ stopped.push(nodes[i]);
+ }
+
+ return stopped;
+ }
+
// Can specify wait as second parameter, if using default signal
if (signal == true || signal == false) {
signal = undefined;
@@ -2555,7 +2502,7 @@ var ReplSetTest = function(opts) {
}
return ret;
- }));
+ };
/**
* Kill all members of this replica set.
@@ -2802,11 +2749,6 @@ var ReplSetTest = function(opts) {
ReplSetTest.kDefaultTimeoutMS = 10 * 60 * 1000;
/**
- * Global default number that's effectively infinite.
- */
-ReplSetTest.kForeverSecs = 24 * 60 * 60;
-
-/**
* Set of states that the replica set can be in. Used for the wait functions.
*/
ReplSetTest.State = {