author     Max Hirschhorn <max.hirschhorn@mongodb.com>       2020-03-11 22:52:25 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-03-12 04:48:36 +0000
commit     5eeb0955011cf96d0218ac0a9d7f54adc9584173 (patch)
tree       48c2935e4aef2a7b7ebbd4df38475c4b833b35ee /jstests/concurrency
parent     2dba93df686147f88bc13486365b1cae86958c7f (diff)
download   mongo-5eeb0955011cf96d0218ac0a9d7f54adc9584173.tar.gz
SERVER-42192 Enable moveChunk FSM workloads to run in stepdown suites.
Adds automatic retry logic to ChunkHelper.moveChunk() to handle cases where killing, terminating, or stepping down the CSRS or replica set shard primary interrupts the moveChunk command.

Exposes replica set connections as part of the "connection cache" so that DBClientRS may be used to track the current primary of the CSRS or of a replica set shard.

Introduces an fsm.forceRunningOutsideTransaction() utility function to prevent a state function from running inside a multi-statement transaction as part of the concurrency_*_multi_stmt_txn*.yml test suites.
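For reference, a minimal sketch of how a workload opts a state function out of the multi-statement transaction suites with the new helper (the state name below is hypothetical; the real callers in this patch are the moveChunk state functions further down):

    // Hypothetical workload state function; mirrors the pattern added to
    // random_moveChunk_base.js and sharded_moveChunk_partitioned.js in this patch.
    $config.states.myUnsafeState = function myUnsafeState(db, collName, connCache) {
        // Throws an Error with isNotSupported=true when this state function was entered
        // from inside a multi-statement transaction; the fsm runner's catch block then
        // treats the error as retryable outside of a transaction (see the fsm.js hunk below).
        fsm.forceRunningOutsideTransaction(this);

        // ... commands that must not run inside a multi-statement transaction ...
    };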
Diffstat (limited to 'jstests/concurrency')
-rw-r--r--  jstests/concurrency/fsm_libs/fsm.js                                  54
-rw-r--r--  jstests/concurrency/fsm_libs/worker_thread.js                         1
-rw-r--r--  jstests/concurrency/fsm_workload_helpers/chunks.js                   61
-rw-r--r--  jstests/concurrency/fsm_workloads/random_moveChunk_base.js            9
-rw-r--r--  jstests/concurrency/fsm_workloads/sharded_base_partitioned.js         2
-rw-r--r--  jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js   70
6 files changed, 134 insertions(+), 63 deletions(-)
diff --git a/jstests/concurrency/fsm_libs/fsm.js b/jstests/concurrency/fsm_libs/fsm.js
index 3aa73477425..e3b8fc0c16b 100644
--- a/jstests/concurrency/fsm_libs/fsm.js
+++ b/jstests/concurrency/fsm_libs/fsm.js
@@ -1,6 +1,18 @@
'use strict';
var fsm = (function() {
+ const kIsRunningInsideTransaction = Symbol('isRunningInsideTransaction');
+
+ function forceRunningOutsideTransaction(data) {
+ if (data[kIsRunningInsideTransaction]) {
+ const err =
+ new Error('Intentionally thrown to stop state function from running inside of a' +
+ ' multi-statement transaction');
+ err.isNotSupported = true;
+ throw err;
+ }
+ }
+
// args.data = 'this' object of the state functions
// args.db = database object
// args.collName = collection name
@@ -9,6 +21,7 @@ var fsm = (function() {
// args.startState = name of initial state function
// args.states = state functions of the form
// { stateName: function(db, collName) { ... } }
+ // args.tid = the thread identifier
// args.transitions = transitions between state functions of the form
// { stateName: { nextState1: probability,
// nextState2: ... } }
@@ -40,14 +53,41 @@ var fsm = (function() {
return conn;
};
- connCache = {mongos: [], config: [], shards: {}};
+ const getReplSetName = (conn) => {
+ const res = assert.commandWorked(conn.getDB('admin').runCommand({isMaster: 1}));
+ assert.eq('string',
+ typeof res.setName,
+ () => `not connected to a replica set: ${tojson(res)}`);
+ return res.setName;
+ };
+
+ const makeReplSetConnWithExistingSession = (connStrList, replSetName) => {
+ const conn = makeNewConnWithExistingSession(`mongodb://${
+ connStrList.join(',')}/?appName=tid:${args.tid}&replicaSet=${replSetName}`);
+
+ return conn;
+ };
+
+ connCache =
+ {mongos: [], config: [], shards: {}, rsConns: {config: undefined, shards: {}}};
connCache.mongos = args.cluster.mongos.map(makeNewConnWithExistingSession);
connCache.config = args.cluster.config.map(makeNewConnWithExistingSession);
+ connCache.rsConns.config = makeReplSetConnWithExistingSession(
+ args.cluster.config, getReplSetName(connCache.config[0]));
+
+ // We set _isConfigServer=true on the Mongo connection object so
+ // set_read_preference_secondary.js knows to avoid overriding the read preference as the
+ // concurrency suite may be running with a 1-node CSRS.
+ connCache.rsConns.config._isConfigServer = true;
var shardNames = Object.keys(args.cluster.shards);
- shardNames.forEach(name => (connCache.shards[name] = args.cluster.shards[name].map(
- makeNewConnWithExistingSession)));
+ shardNames.forEach(name => {
+ connCache.shards[name] =
+ args.cluster.shards[name].map(makeNewConnWithExistingSession);
+ connCache.rsConns.shards[name] = makeReplSetConnWithExistingSession(
+ args.cluster.shards[name], getReplSetName(connCache.shards[name][0]));
+ });
}
for (var i = 0; i < args.iterations; ++i) {
@@ -63,8 +103,10 @@ var fsm = (function() {
let data;
withTxnAndAutoRetry(args.db.getSession(), () => {
data = TransactionsUtil.deepCopyObject({}, args.data);
+ data[kIsRunningInsideTransaction] = true;
fn.call(data, args.db, args.collName, connCache);
});
+ delete data[kIsRunningInsideTransaction];
args.data = data;
} catch (e) {
// Retry state functions that threw OperationNotSupportedInTransaction or
@@ -128,5 +170,9 @@ var fsm = (function() {
assert(false, 'not reached');
}
- return {run: runFSM, _getWeightedRandomChoice: getWeightedRandomChoice};
+ return {
+ forceRunningOutsideTransaction,
+ run: runFSM,
+ _getWeightedRandomChoice: getWeightedRandomChoice,
+ };
})();
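For orientation, a sketch of the connection cache shape that state functions now receive when passConnectionCache is enabled (shard names and member counts depend on the suite's cluster topology):

    // Approximate shape of connCache as built above:
    // {
    //     mongos: [Mongo, ...],                // one connection per mongos
    //     config: [Mongo, ...],                // one connection per CSRS member
    //     shards: {<shardName>: [Mongo, ...]}, // one connection per shard member
    //     rsConns: {
    //         config: Mongo,                   // replica set (DBClientRS) connection to the CSRS
    //         shards: {<shardName>: Mongo},    // replica set connection per shard
    //     },
    // }
    //
    // A state function can therefore route operations to whichever node is currently primary:
    const configDB = connCache.rsConns.config.getDB('config');
    const chunkDoc = configDB.chunks.findOne({});  // illustrative query against the CSRS primary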
diff --git a/jstests/concurrency/fsm_libs/worker_thread.js b/jstests/concurrency/fsm_libs/worker_thread.js
index 58eeed3e66f..7e237e6257b 100644
--- a/jstests/concurrency/fsm_libs/worker_thread.js
+++ b/jstests/concurrency/fsm_libs/worker_thread.js
@@ -205,6 +205,7 @@ var workerThread = (function() {
passConnectionCache: config.passConnectionCache,
startState: config.startState,
states: config.states,
+ tid: args.tid,
transitions: config.transitions
};
});
diff --git a/jstests/concurrency/fsm_workload_helpers/chunks.js b/jstests/concurrency/fsm_workload_helpers/chunks.js
index d4c78b3de53..cf28c7ebe4e 100644
--- a/jstests/concurrency/fsm_workload_helpers/chunks.js
+++ b/jstests/concurrency/fsm_workload_helpers/chunks.js
@@ -21,15 +21,10 @@ var ChunkHelper = (function() {
return Math.min(curSleep, MAX_BACKOFF_SLEEP);
}
- function runCommandWithRetries(db, cmd, acceptableErrorCodes) {
+ function runCommandWithRetries(db, cmd, didAcceptableErrorOccurFn) {
const INITIAL_BACKOFF_SLEEP = 500; // milliseconds
const MAX_RETRIES = 5;
- var acceptableErrorOccurred = function acceptableErrorOccurred(errorCode,
- acceptableErrorCodes) {
- return acceptableErrorCodes.indexOf(errorCode) > -1;
- };
-
var res;
var retries = 0;
var backoffSleep = INITIAL_BACKOFF_SLEEP;
@@ -41,12 +36,15 @@ var ChunkHelper = (function() {
return res;
}
// Assert command worked or acceptable error occurred.
- var msg = tojson({command: cmd, res: res});
- assertWhenOwnColl(acceptableErrorOccurred(res.code, acceptableErrorCodes), msg);
+ if (didAcceptableErrorOccurFn(res)) {
+ // When an acceptable error occurs, sleep and then retry.
+ sleep(backoffSleep);
+ backoffSleep = getNextBackoffSleep(backoffSleep);
+ continue;
+ }
- // When an acceptable error occurs, sleep and then retry.
- sleep(backoffSleep);
- backoffSleep = getNextBackoffSleep(backoffSleep);
+ // Throw an exception if the command errored for any other reason.
+ assertWhenOwnColl.commandWorked(res, cmd);
}
return res;
@@ -54,14 +52,12 @@ var ChunkHelper = (function() {
function splitChunkAtPoint(db, collName, splitPoint) {
var cmd = {split: db[collName].getFullName(), middle: {_id: splitPoint}};
- var acceptableErrorCodes = [ErrorCodes.LockBusy];
- return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+ return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
}
function splitChunkWithBounds(db, collName, bounds) {
var cmd = {split: db[collName].getFullName(), bounds: bounds};
- var acceptableErrorCodes = [ErrorCodes.LockBusy];
- return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+ return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
}
function moveChunk(db, collName, bounds, toShard, waitForDelete) {
@@ -71,15 +67,28 @@ var ChunkHelper = (function() {
to: toShard,
_waitForDelete: waitForDelete
};
- var acceptableErrorCodes =
- [ErrorCodes.ConflictingOperationInProgress, ErrorCodes.ChunkRangeCleanupPending];
- return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+
+ const runningWithStepdowns =
+ TestData.runningWithConfigStepdowns || TestData.runningWithShardStepdowns;
+
+ return runCommandWithRetries(
+ db,
+ cmd,
+ res => (res.code === ErrorCodes.ConflictingOperationInProgress ||
+ res.code === ErrorCodes.ChunkRangeCleanupPending ||
+ // The chunk migration has surely been aborted if the startCommit of the
+ // procedure was interrupted by a stepdown.
+ (runningWithStepdowns && res.code === ErrorCodes.CommandFailed &&
+ res.errmsg.includes("startCommit")) ||
+ // The chunk migration has surely been aborted if the recipient shard didn't
+ // believe there was an active chunk migration.
+ (runningWithStepdowns && res.code === ErrorCodes.OperationFailed &&
+ res.errmsg.includes("NotYetInitialized"))));
}
function mergeChunks(db, collName, bounds) {
var cmd = {mergeChunks: db[collName].getFullName(), bounds: bounds};
- var acceptableErrorCodes = [ErrorCodes.LockBusy];
- return runCommandWithRetries(db, cmd, acceptableErrorCodes);
+ return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
}
// Take a set of connections to a shard (replica set or standalone mongod),
@@ -141,11 +150,19 @@ var ChunkHelper = (function() {
return {shards: shards, explain: res, query: query, shardVersion: shardVersion};
}
+ function itcount(collection, query) {
+ // We project out all of the fields in order to greatly reduce the likelihood a cursor would
+ // actually be returned. This is acceptable because we're only interested in how many
+ // documents there were and not any of their contents. The network_error_and_txn_override.js
+ // override would throw an exception if we attempted to use the getMore command.
+ return collection.find(query, {_id: 0, nonExistingField: 1}).itcount();
+ }
+
// Return the number of docs in [lower, upper) as seen by conn.
function getNumDocs(conn, collName, lower, upper) {
var coll = conn.getCollection(collName);
var query = {$and: [{_id: {$gte: lower}}, {_id: {$lt: upper}}]};
- return coll.find(query).itcount();
+ return itcount(coll, query);
}
// Intended for use on config or mongos connections only.
@@ -157,7 +174,7 @@ var ChunkHelper = (function() {
assert(isString(ns) && ns.indexOf('.') !== -1 && !ns.startsWith('.') && !ns.endsWith('.'),
ns + ' is not a valid namespace');
var query = {'ns': ns, 'min._id': {$gte: lower}, 'max._id': {$lte: upper}};
- return conn.getDB('config').chunks.find(query).itcount();
+ return itcount(conn.getDB('config').chunks, query);
}
// Intended for use on config or mongos connections only.
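With the acceptable-error check now expressed as a predicate over the raw command response, a new helper only needs to describe which responses are retryable. A sketch under that assumption (the helper name and usage are illustrative, not part of this patch):

    // Hypothetical helper following the same pattern as splitChunkAtPoint() and mergeChunks().
    function runWithLockBusyRetries(db, cmd) {
        // Retries with exponential backoff only while the response is LockBusy; any other
        // failure is surfaced through assertWhenOwnColl.commandWorked(res, cmd).
        return runCommandWithRetries(db, cmd, res => res.code === ErrorCodes.LockBusy);
    }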
diff --git a/jstests/concurrency/fsm_workloads/random_moveChunk_base.js b/jstests/concurrency/fsm_workloads/random_moveChunk_base.js
index 6e74c0e5458..a35c1ce5c5a 100644
--- a/jstests/concurrency/fsm_workloads/random_moveChunk_base.js
+++ b/jstests/concurrency/fsm_workloads/random_moveChunk_base.js
@@ -70,9 +70,14 @@ var $config = extendWorkload($config, function($config, $super) {
* acceptable errors, e.g. ConflictingOperationInProgress, and is not guaranteed to succeed.
*/
$config.states.moveChunk = function moveChunk(db, collName, connCache) {
+ // Committing a chunk migration requires acquiring the global X lock on the CSRS primary.
+ // This state function is unsafe to automatically run inside a multi-statement transaction
+ // because it'll have left an idle transaction on the CSRS primary before attempting to run
+ // the moveChunk command, which can lead to a hang.
+ fsm.forceRunningOutsideTransaction(this);
+
// Choose a random chunk in our partition to move.
- const chunk =
- this.getRandomChunkInPartition(collName, ChunkHelper.getPrimary(connCache.config));
+ const chunk = this.getRandomChunkInPartition(collName, connCache.rsConns.config);
const fromShard = chunk.shard;
// Choose a random shard to move the chunk to.
diff --git a/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js b/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
index 2f1f3f4f103..92184d58df0 100644
--- a/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
+++ b/jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
@@ -139,7 +139,7 @@ var $config = (function() {
Object.freeze(this.partition);
// Verify that there is exactly 1 chunk in our partition.
- var config = ChunkHelper.getPrimary(connCache.config);
+ var config = connCache.rsConns.config;
var numChunks = ChunkHelper.getNumChunks(
config, ns, this.partition.chunkLower, this.partition.chunkUpper);
var chunks = ChunkHelper.getChunks(config, ns, MinKey, MaxKey);
diff --git a/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js b/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
index 33dc02c405b..3717fadf7dc 100644
--- a/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
+++ b/jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
@@ -22,9 +22,14 @@ var $config = extendWorkload($config, function($config, $super) {
// verify that each node in the cluster affected by the moveChunk operation sees
// the appropriate after-state regardless of whether the operation succeeded or failed.
$config.states.moveChunk = function moveChunk(db, collName, connCache) {
- var dbName = db.getName();
+ // Committing a chunk migration requires acquiring the global X lock on the CSRS primary.
+ // This state function is unsafe to automatically run inside a multi-statement transaction
+ // because it'll have left an idle transaction on the CSRS primary before attempting to run
+ // the moveChunk command, which can lead to a hang.
+ fsm.forceRunningOutsideTransaction(this);
+
var ns = db[collName].getFullName();
- var config = ChunkHelper.getPrimary(connCache.config);
+ var config = connCache.rsConns.config;
// Verify that more than one shard exists in the cluster. If only one shard existed,
// there would be no way to move a chunk from one shard to another.
@@ -73,17 +78,22 @@ var $config = extendWorkload($config, function($config, $super) {
// Verify that the fromShard and toShard have the correct after-state
// (see comments below for specifics).
- var fromShardPrimary = ChunkHelper.getPrimary(connCache.shards[fromShard]);
- var toShardPrimary = ChunkHelper.getPrimary(connCache.shards[toShard]);
+ var fromShardRSConn = connCache.rsConns.shards[fromShard];
+ var toShardRSConn = connCache.rsConns.shards[toShard];
var fromShardNumDocsAfter =
- ChunkHelper.getNumDocs(fromShardPrimary, ns, chunk.min._id, chunk.max._id);
+ ChunkHelper.getNumDocs(fromShardRSConn, ns, chunk.min._id, chunk.max._id);
var toShardNumDocsAfter =
- ChunkHelper.getNumDocs(toShardPrimary, ns, chunk.min._id, chunk.max._id);
+ ChunkHelper.getNumDocs(toShardRSConn, ns, chunk.min._id, chunk.max._id);
// If the moveChunk operation succeeded, verify that the shard the chunk
// was moved to returns all data for the chunk. If waitForDelete was true,
// also verify that the shard the chunk was moved from returns no data for the chunk.
if (moveChunkRes.ok) {
- if (waitForDelete) {
+ const runningWithStepdowns =
+ TestData.runningWithConfigStepdowns || TestData.runningWithShardStepdowns;
+
+ // TODO SERVER-46669: The moveChunk command can succeed without waiting for the range
+ // deletion to complete if the replica set shard primary steps down.
+ if (waitForDelete && !runningWithStepdowns) {
msg = 'moveChunk succeeded but original shard still had documents.\n' + msgBase +
', waitForDelete: ' + waitForDelete + ', bounds: ' + tojson(bounds);
assertWhenOwnColl.eq(fromShardNumDocsAfter, 0, msg);
@@ -102,35 +112,27 @@ var $config = extendWorkload($config, function($config, $super) {
}
// Verify that all config servers have the correct after-state.
- // (see comments below for specifics).
- for (var conn of connCache.config) {
- var res = conn.adminCommand({isMaster: 1});
- assertAlways.commandWorked(res);
- if (res.ismaster) {
- // If the moveChunk operation succeeded, verify that the config updated the chunk's
- // shard with the toShard. If the operation failed, verify that the config kept
- // the chunk's shard as the fromShard.
- var chunkAfter = conn.getDB('config').chunks.findOne({_id: chunk._id});
- var msg = msgBase + '\nchunkBefore: ' + tojson(chunk) +
- '\nchunkAfter: ' + tojson(chunkAfter);
- if (moveChunkRes.ok) {
- msg = "moveChunk succeeded but chunk's shard was not new shard.\n" + msg;
- assertWhenOwnColl.eq(chunkAfter.shard, toShard, msg);
- } else {
- msg = "moveChunk failed but chunk's shard was not original shard.\n" + msg;
- assertWhenOwnColl.eq(chunkAfter.shard, fromShard, msg);
- }
-
- // Regardless of whether the operation succeeded or failed,
- // verify that the number of chunks in our partition stayed the same.
- var numChunksAfter = ChunkHelper.getNumChunks(
- conn, ns, this.partition.chunkLower, this.partition.chunkUpper);
- msg = 'Number of chunks in partition seen by config changed with moveChunk.\n' +
- msgBase;
- assertWhenOwnColl.eq(numChunksBefore, numChunksAfter, msg);
- }
+ // If the moveChunk operation succeeded, verify that the config updated the chunk's shard
+ // with the toShard. If the operation failed, verify that the config kept the chunk's shard
+ // as the fromShard.
+ var chunkAfter = config.getDB('config').chunks.findOne({_id: chunk._id});
+ var msg =
+ msgBase + '\nchunkBefore: ' + tojson(chunk) + '\nchunkAfter: ' + tojson(chunkAfter);
+ if (moveChunkRes.ok) {
+ msg = "moveChunk succeeded but chunk's shard was not new shard.\n" + msg;
+ assertWhenOwnColl.eq(chunkAfter.shard, toShard, msg);
+ } else {
+ msg = "moveChunk failed but chunk's shard was not original shard.\n" + msg;
+ assertWhenOwnColl.eq(chunkAfter.shard, fromShard, msg);
}
+ // Regardless of whether the operation succeeded or failed, verify that the number of chunks
+ // in our partition stayed the same.
+ var numChunksAfter = ChunkHelper.getNumChunks(
+ config, ns, this.partition.chunkLower, this.partition.chunkUpper);
+ msg = 'Number of chunks in partition seen by config changed with moveChunk.\n' + msgBase;
+ assertWhenOwnColl.eq(numChunksBefore, numChunksAfter, msg);
+
// Verify that all mongos processes see the correct after-state on the shards and configs.
// (see comments below for specifics).
for (var mongos of connCache.mongos) {