author      Didier Nadeau <didier.nadeau@mongodb.com>    2022-04-12 13:54:25 +0000
committer   Evergreen Agent <no-reply@evergreen.mongodb.com>    2022-04-12 21:18:25 +0000
commit      cff8825855ae567667d3834861433f8f70303b84 (patch)
tree        403c938079a4e3cdc1ac5ffadbc7f4caffae2b04
parent      d8934508d139a3ed7d1fbd8d96971019166f2944 (diff)
download    mongo-cff8825855ae567667d3834861433f8f70303b84.tar.gz
SERVER-63092 Improve split test library and add tests for write
-rw-r--r--  jstests/serverless/libs/basic_serverless_test.js              | 178
-rw-r--r--  jstests/serverless/libs/shard_split_write_test.js             |  43
-rw-r--r--  jstests/serverless/shard_split_basic_test.js                  |  20
-rw-r--r--  jstests/serverless/shard_split_reconfig.js                    |  21
-rw-r--r--  jstests/serverless/shard_split_rejects_multiple_ops.js        |   5
-rw-r--r--  jstests/serverless/shard_split_startup_recovery_aborted.js    |   2
-rw-r--r--  jstests/serverless/shard_split_tenant_access_blocking.js      |  25
-rw-r--r--  jstests/serverless/shard_split_wait_for_block_timestamp.js    |  53
-rw-r--r--  jstests/serverless/shard_split_write_during_aborted_split.js  |  69
-rw-r--r--  jstests/serverless/shard_split_write_during_shard_split.js    |  68
-rw-r--r--  jstests/serverless/shard_split_write_during_split_stepdown.js |  72
-rw-r--r--  src/mongo/db/serverless/shard_split_commands.cpp              |   8
-rw-r--r--  src/mongo/db/serverless/shard_split_donor_service.cpp         |  11
13 files changed, 474 insertions(+), 101 deletions(-)
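The core of this change is a new ShardSplitOperation helper in basic_serverless_test.js that wraps the commitShardSplit, abortShardSplit and forgetShardSplit commands together with their retry handling. A minimal sketch of the resulting test pattern (mirroring shard_split_basic_test.js below):

    const test = new BasicServerlessTest(
        {recipientTagName: "recipientNode", recipientSetName: "recipientSetName"});
    test.addRecipientNodes();

    const operation = test.createSplitOperation(["tenant1", "tenant2"]);
    assert.commandWorked(operation.commit());  // retries on retryable errors by default

    test.removeRecipientNodesFromDonor();
    assertMigrationState(test.donor.getPrimary(), operation.migrationId, "committed");

    operation.forget();
    test.waitForGarbageCollection(operation.migrationId, ["tenant1", "tenant2"]);
    test.stop();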
diff --git a/jstests/serverless/libs/basic_serverless_test.js b/jstests/serverless/libs/basic_serverless_test.js
index 67bb9093b0c..e9b56495733 100644
--- a/jstests/serverless/libs/basic_serverless_test.js
+++ b/jstests/serverless/libs/basic_serverless_test.js
@@ -1,3 +1,139 @@
+load("jstests/replsets/rslib.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/uuid_util.js");
+
+const runCommitShardSplitAsync = function(
+ rstArgs, migrationIdString, tenantIds, recipientTagName, recipientSetName) {
+ load("jstests/replsets/rslib.js");
+
+ const donorRst = createRst(rstArgs, true);
+ const admin = donorRst.getPrimary().getDB("admin");
+
+ return admin.runCommand({
+ commitShardSplit: 1,
+ migrationId: UUID(migrationIdString),
+ tenantIds,
+ recipientTagName,
+ recipientSetName
+ });
+};
+
+const runShardSplitCommand = function(replicaSet, cmdObj, retryOnRetryableErrors) {
+ const primary = replicaSet.getPrimary();
+
+ let res;
+ assert.soon(() => {
+ try {
+ res = primary.adminCommand(cmdObj);
+
+ if (!res.ok) {
+                // If retry is enabled and the command failed with a retryable error, continue
+                // looping.
+ const cmdName = Object.keys(cmdObj)[0];
+ if (retryOnRetryableErrors && isRetryableError(res.code)) {
+ jsTestLog(`runShardSplitCommand retryable error. Command: ${
+ tojson(cmdObj)}, reply: ${tojson(res)}`);
+ primary = replicaSet.getPrimary();
+ return false;
+ }
+ jsTestLog(`runShardSplitCommand fatal error. Command: ${tojson(cmdObj)}, reply: ${
+ tojson(res)}`);
+ return true;
+ }
+ return true;
+ } catch (e) {
+ if (retryOnRetryableErrors && isRetryableError(e)) {
+ jsTestLog(`runShardSplitCommand retryable error. Command: ${
+ tojson(cmdObj)}, reply: ${tojson(res)}`);
+ return false;
+ }
+ jsTestLog(`runShardSplitCommand fatal error. Command: ${tojson(cmdObj)}, reply: ${
+ tojson(res)}`);
+ throw e;
+ }
+ });
+ return res;
+};
+
+/**
+ * Utility class to run shard split operations.
+ */
+class ShardSplitOperation {
+ constructor(donorSet, recipientSetName, recipientTagName, tenantIds, migrationId) {
+ this.donorSet = donorSet;
+ this.recipientTagName = recipientTagName;
+ this.recipientSetName = recipientSetName;
+ this.tenantIds = tenantIds;
+ this.migrationId = migrationId;
+ }
+
+ /**
+ * Starts a shard split synchronously.
+ */
+ commit({retryOnRetryableErrors} = {retryOnRetryableErrors: true}) {
+ jsTestLog("Running commit command");
+
+ const primary = this.donorSet.getPrimary();
+
+ const localCmdObj = {
+ commitShardSplit: 1,
+ migrationId: this.migrationId,
+ tenantIds: this.tenantIds,
+ recipientTagName: this.recipientTagName,
+ recipientSetName: this.recipientSetName
+ };
+
+ return runShardSplitCommand(this.donorSet, localCmdObj, retryOnRetryableErrors);
+ }
+
+ /**
+ * Starts a shard split asynchronously and returns the Thread that runs it.
+ */
+ commitAsync() {
+ jsTestLog("Running commitAsync command");
+
+ const donorRst = createRstArgs(this.donorSet);
+ const migrationIdString = extractUUIDFromObject(this.migrationId);
+
+ const thread = new Thread(runCommitShardSplitAsync,
+ donorRst,
+ migrationIdString,
+ this.tenantIds,
+ this.recipientTagName,
+ this.recipientSetName);
+ thread.start();
+
+ return thread;
+ }
+
+ /**
+ * Forgets a shard split synchronously.
+ */
+ forget() {
+ jsTestLog("Running forgetShardSplit command");
+
+ const cmdObj = {forgetShardSplit: 1, migrationId: this.migrationId};
+
+ assert.commandWorked(
+            runShardSplitCommand(this.donorSet, cmdObj, true /* retryOnRetryableErrors */));
+ }
+
+ /**
+ * Aborts a shard split synchronously.
+ */
+ abort() {
+ jsTestLog("Running abort command");
+
+ const admin = this.donorSet.getPrimary().getDB("admin");
+
+ return admin.runCommand({abortShardSplit: 1, migrationId: this.migrationId});
+ }
+}
+
+/**
+ * Utility class that creates a ReplSetTest and provides the functionality to run a shard split
+ * operation.
+ */
class BasicServerlessTest {
constructor({recipientTagName, recipientSetName, quickGarbageCollection = false, nodeOptions}) {
nodeOptions = nodeOptions || {};
@@ -29,6 +165,19 @@ class BasicServerlessTest {
}
/*
+ * Returns a ShardSplitOperation object to run a shard split.
+     * @param {tenantIds} the tenant ids to run the split for.
+ */
+ createSplitOperation(tenantIds) {
+ const migrationId = UUID();
+        jsTestLog("Asserting no state document exists before command");
+ assert.isnull(findMigration(this.donor.getPrimary(), migrationId));
+
+ return new ShardSplitOperation(
+ this.donor, this.recipientSetName, this.recipientTagName, tenantIds, migrationId);
+ }
+
+ /*
* Add recipient nodes to the current donor set.
* @param {numNodes} indicates the number of recipient nodes to be added.
*/
@@ -114,6 +263,16 @@ class BasicServerlessTest {
}
/*
+     * Takes an rstArgs produced by createRstArgs and removes the recipient nodes from it.
+ */
+ removeRecipientsFromRstArgs(rstArgs) {
+ rstArgs.nodeHosts = rstArgs.nodeHosts.filter(nodeString => {
+ const port = parseInt(nodeString.split(":")[1]);
+ return !this.recipientNodes.some(node => node.port == port);
+ });
+ }
+
+ /*
* Wait for state document garbage collection by polling for when the document has been removed
* from the tenantSplitDonors namespace, and all access blockers have been removed.
* @param {migrationId} id that was used for the commitShardSplit command.
@@ -150,13 +309,6 @@ class BasicServerlessTest {
}));
}
- forgetShardSplit(migrationId) {
- jsTestLog("Running forgetShardSplit command");
- const cmdObj = {forgetShardSplit: 1, migrationId: migrationId};
- const primary = this.donor.getPrimary();
- return assert.commandWorked(primary.getDB('admin').runCommand(cmdObj));
- }
-
removeRecipientNodesFromDonor() {
jsTestLog("Removing recipient nodes from the donor.");
this.donor.nodes = this.donor.nodes.filter(node => !this.recipientNodes.includes(node));
@@ -200,18 +352,18 @@ class BasicServerlessTest {
BasicServerlessTest.kConfigSplitDonorsNS = "config.tenantSplitDonors";
-function findMigration(primary, uuid) {
+function findMigration(primary, migrationId) {
const donorsCollection = primary.getCollection(BasicServerlessTest.kConfigSplitDonorsNS);
- return donorsCollection.findOne({"_id": uuid});
+ return donorsCollection.findOne({"_id": migrationId});
}
-function cleanupMigrationDocument(primary, uuid) {
+function cleanupMigrationDocument(primary, migrationId) {
const donorsCollection = primary.getCollection(BasicServerlessTest.kConfigSplitDonorsNS);
- return donorsCollection.deleteOne({"_id": uuid}, {w: "majority"});
+ return donorsCollection.deleteOne({"_id": migrationId}, {w: "majority"});
}
-function assertMigrationState(primary, uuid, state) {
- const migrationDoc = findMigration(primary, uuid);
+function assertMigrationState(primary, migrationId, state) {
+ const migrationDoc = findMigration(primary, migrationId);
assert(migrationDoc);
if (migrationDoc.state === 'aborted') {
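Together with commitAsync(), the library lets a test drive a split from a parallel Thread while the main thread manipulates failpoints or runs writes; a minimal sketch of the pattern the new tests below rely on (assuming test and operation are set up as above):

    const splitThread = operation.commitAsync();
    // ... wait on a failpoint, run writes, step down the primary, etc. ...
    splitThread.join();
    assert.commandWorked(splitThread.returnData());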
diff --git a/jstests/serverless/libs/shard_split_write_test.js b/jstests/serverless/libs/shard_split_write_test.js
new file mode 100644
index 00000000000..eb3685eb298
--- /dev/null
+++ b/jstests/serverless/libs/shard_split_write_test.js
@@ -0,0 +1,43 @@
+/**
+ * Write helpers for shard split tests that write before and during the split.
+ * @tags: [requires_fcv_52, featureFlagShardSplit]
+ */
+
+load("jstests/libs/fail_point_util.js"); // for "configureFailPoint"
+load('jstests/libs/parallel_shell_helpers.js'); // for "startParallelShell"
+load("jstests/serverless/libs/basic_serverless_test.js");
+
+/*
+ * Connects to a replica set and runs write operations, returning the results.
+ * @param {rstArgs} replicaSetArgs for the replica set to connect to.
+ * @param {tenantIds} tenant ids to perform a write operation for.
+ */
+function doWriteOperations(rstArgs, tenantIds) {
+ load("jstests/replsets/rslib.js");
+
+ const donorRst = createRst(rstArgs, true);
+ const donorPrimary = donorRst.getPrimary();
+
+ const writeResults = [];
+
+ tenantIds.forEach(id => {
+ const kDbName = `${id}_testDb`;
+ const kCollName = "testColl";
+ const kNs = `${kDbName}.${kCollName}`;
+
+        const res = donorPrimary.getCollection(kNs)
+                        .insert([{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}],
+                                {writeConcern: {w: "majority"}})
+                        .getRawResponse();
+ if (res.writeErrors.length > 0) {
+ writeResults.push(res.writeErrors[0].code);
+ } else {
+ // Push OK
+ writeResults.push(0);
+ }
+ });
+
+ return writeResults;
+}
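doWriteOperations is intended to run on a parallel Thread against the donor nodes only, so the recipient nodes must first be stripped from the rstArgs; a sketch of the invocation used by the new write tests below (assuming test and tenantIds are set up as in those tests):

    const donorRst = createRstArgs(test.donor);
    test.removeRecipientsFromRstArgs(donorRst);
    const writeThread = new Thread(doWriteOperations, donorRst, tenantIds);
    writeThread.start();
    // ... drive the split to its outcome ...
    writeThread.join();
    // Each entry is 0 (OK) or the write error code observed for that tenant.
    writeThread.returnData().forEach(code => assert.eq(code, ErrorCodes.OK));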
diff --git a/jstests/serverless/shard_split_basic_test.js b/jstests/serverless/shard_split_basic_test.js
index ad22026c10e..dbf72f974ef 100644
--- a/jstests/serverless/shard_split_basic_test.js
+++ b/jstests/serverless/shard_split_basic_test.js
@@ -13,31 +13,27 @@ load("jstests/serverless/libs/basic_serverless_test.js");
const recipientTagName = "recipientNode";
const recipientSetName = "recipientSetName";
+const tenantIds = ["tenant1", "tenant2"];
+
const test =
new BasicServerlessTest({recipientTagName, recipientSetName, quickGarbageCollection: true});
test.addRecipientNodes();
test.donor.awaitSecondaryNodes();
-const migrationId = UUID();
+const operation = test.createSplitOperation(tenantIds);
jsTestLog("Running the commitShardSplit operation");
-const admin = test.donor.getPrimary().getDB("admin");
-const tenantIds = ["tenant1", "tenant2"];
-assert.soon(() => {
- const result = admin.runCommand(
- {commitShardSplit: 1, migrationId, tenantIds, recipientTagName, recipientSetName});
- assert.commandWorked(result);
- return result.state === 'committed';
-});
+assert.commandWorked(operation.commit());
test.removeRecipientNodesFromDonor();
// getPrimary can only be called once recipient nodes have been removed from the test.
-assertMigrationState(test.donor.getPrimary(), migrationId, "committed");
+assertMigrationState(test.donor.getPrimary(), operation.migrationId, "committed");
+
+operation.forget();
-test.forgetShardSplit(migrationId);
+test.waitForGarbageCollection(operation.migrationId, tenantIds);
-test.waitForGarbageCollection(migrationId, tenantIds);
test.stop();
})();
diff --git a/jstests/serverless/shard_split_reconfig.js b/jstests/serverless/shard_split_reconfig.js
index b0177a6f06b..06d1a31fd6a 100644
--- a/jstests/serverless/shard_split_reconfig.js
+++ b/jstests/serverless/shard_split_reconfig.js
@@ -15,6 +15,7 @@ function shardSplitApplySplitConfig() {
// Skip db hash check because secondary is left with a different config.
TestData.skipCheckDBHashes = true;
+ const tenantIds = ["tenant1", "tenant2"];
const test = new BasicServerlessTest({
recipientTagName: "recipientNode",
@@ -24,24 +25,14 @@ function shardSplitApplySplitConfig() {
test.addRecipientNodes();
const donorPrimary = test.donor.getPrimary();
- const migrationId = UUID();
- const tenantIds = ["tenant1", "tenant2"];
- jsTestLog("Asserting no state document exist before command");
- assert.isnull(findMigration(donorPrimary, migrationId));
+ const operation = test.createSplitOperation(tenantIds);
jsTestLog("Running commitShardSplit command");
- const adminDb = donorPrimary.getDB("admin");
- assert.commandWorked(adminDb.runCommand({
- commitShardSplit: 1,
- migrationId,
- recipientTagName: test.recipientTagName,
- recipientSetName: test.recipientSetName,
- tenantIds
- }));
+ assert.commandWorked(operation.commit());
jsTestLog("Asserting state document exist after command");
- assertMigrationState(donorPrimary, migrationId, "committed");
+ assertMigrationState(donorPrimary, operation.migrationId, "committed");
test.removeRecipientNodesFromDonor();
@@ -53,9 +44,9 @@ function shardSplitApplySplitConfig() {
assert.eq(configDoc["recipientConfig"]["_id"], "recipient");
assert.eq(configDoc["recipientConfig"]["members"].length, 3);
- test.forgetShardSplit(migrationId);
+ operation.forget();
- test.waitForGarbageCollection(migrationId, tenantIds);
+ test.waitForGarbageCollection(operation.migrationId, tenantIds);
test.stop();
}
diff --git a/jstests/serverless/shard_split_rejects_multiple_ops.js b/jstests/serverless/shard_split_rejects_multiple_ops.js
index da91999f823..35099bf0633 100644
--- a/jstests/serverless/shard_split_rejects_multiple_ops.js
+++ b/jstests/serverless/shard_split_rejects_multiple_ops.js
@@ -70,7 +70,8 @@ function commitShardSplitConcurrently() {
117); // ConflictingOperationInProgress
test.removeRecipientNodesFromDonor();
- test.forgetShardSplit(migrationId);
+ assert.commandWorked(
+ donorPrimary.adminCommand({forgetShardSplit: 1, migrationId: migrationId}));
// fails because the commitShardSplit state document hasn't been garbage collected yet.
assert.commandFailedWithCode(donorPrimary.adminCommand({
@@ -111,7 +112,7 @@ function commitShardSplitAfterAbort() {
const admin = test.donor.getPrimary().getDB("admin");
let fp = configureFailPoint(admin, "pauseShardSplitAfterBlocking");
- assert.commandWorked(admin.runCommand(
+ assert.commandFailed(admin.runCommand(
{commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds}));
fp.wait();
diff --git a/jstests/serverless/shard_split_startup_recovery_aborted.js b/jstests/serverless/shard_split_startup_recovery_aborted.js
index dceca6a7f5b..56016dc4ccf 100644
--- a/jstests/serverless/shard_split_startup_recovery_aborted.js
+++ b/jstests/serverless/shard_split_startup_recovery_aborted.js
@@ -41,7 +41,7 @@ let fp = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterDe
const tenantIds = ["tenant5", "tenant6"];
-assert.commandWorked(donorPrimary.adminCommand(
+assert.commandFailed(donorPrimary.adminCommand(
{commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds}));
fp.wait();
diff --git a/jstests/serverless/shard_split_tenant_access_blocking.js b/jstests/serverless/shard_split_tenant_access_blocking.js
index 814a8c6818f..5303deeaf75 100644
--- a/jstests/serverless/shard_split_tenant_access_blocking.js
+++ b/jstests/serverless/shard_split_tenant_access_blocking.js
@@ -25,11 +25,12 @@ const test = new BasicServerlessTest({
test.addRecipientNodes();
const donorPrimary = test.donor.getPrimary();
-const migrationId = UUID();
+const maxTimeMS = 2 * 1000;  // 2 seconds
const tenantIds = ["tenant1", "tenant2"];
jsTestLog("Asserting no state document exist before command");
-assert.isnull(findMigration(donorPrimary, migrationId));
+const operation = test.createSplitOperation(tenantIds);
+assert.isnull(findMigration(donorPrimary, operation.migrationId));
jsTestLog("Asserting we can write before the migration");
tenantIds.forEach(id => {
@@ -44,16 +45,12 @@ const adminDb = donorPrimary.getDB("admin");
const blockingFailPoint = configureFailPoint(adminDb, "pauseShardSplitAfterBlocking");
jsTestLog("Running commitShardSplit command");
-const awaitCommand = startParallelShell(
- funWithArgs(function(migrationId, recipientTagName, recipientSetName, tenantIds) {
- assert.commandWorked(db.adminCommand(
- {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds}));
- }, migrationId, test.recipientTagName, test.recipientSetName, tenantIds), donorPrimary.port);
+const runThread = operation.commitAsync();
blockingFailPoint.wait();
jsTestLog("Asserting state document is in blocking state");
-assertMigrationState(donorPrimary, migrationId, "blocking");
+assertMigrationState(donorPrimary, operation.migrationId, "blocking");
jsTestLog("Asserting we cannot write in blocking state");
let writeThreads = [];
@@ -82,16 +79,20 @@ assert.soon(function() {
jsTestLog("Disabling failpoints and waiting for command to complete");
blockingFailPoint.off();
-awaitCommand();
+
+runThread.join();
+const data = runThread.returnData();
+assert.commandWorked(data);
writeThreads.forEach(thread => thread.join());
jsTestLog("Asserting state document exist after command");
-assertMigrationState(donorPrimary, migrationId, "committed");
+assertMigrationState(donorPrimary, operation.migrationId, "committed");
test.removeRecipientNodesFromDonor();
-test.forgetShardSplit(migrationId);
-test.waitForGarbageCollection(migrationId, tenantIds);
+operation.forget();
+
+test.waitForGarbageCollection(operation.migrationId, tenantIds);
test.stop();
})();
diff --git a/jstests/serverless/shard_split_wait_for_block_timestamp.js b/jstests/serverless/shard_split_wait_for_block_timestamp.js
index 09a86891593..9399b46f909 100644
--- a/jstests/serverless/shard_split_wait_for_block_timestamp.js
+++ b/jstests/serverless/shard_split_wait_for_block_timestamp.js
@@ -41,27 +41,14 @@ for (let i = 0; i < 2000; i++) {
assert.commandWorked(bulk.execute());
jsTestLog("Running commitShardSplit command");
-const firstOperationId = UUID();
-assert.isnull(findMigration(donorPrimary, firstOperationId));
-const awaitFirstSplitOperation = startParallelShell(
- funWithArgs(
- function(migrationId, recipientTagName, recipientSetName, tenantIds) {
- assert.commandWorked(db.adminCommand(
- {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds}));
- },
- firstOperationId,
- test.recipientTagName,
- test.recipientSetName,
- tenantIds),
- donorPrimary.port);
+const firstOperation = test.createSplitOperation(tenantIds);
+assert.isnull(findMigration(donorPrimary, firstOperation.migrationId));
+const res = firstOperation.commit({retryOnRetryableErrors: false});
+assert.commandFailed(res);
+assert.eq(res.code, ErrorCodes.TenantMigrationAborted);
-awaitFirstSplitOperation();
-assertMigrationState(donorPrimary, firstOperationId, "aborted");
-
-jsTestLog(`Running forgetShardSplit command: ${tojson(firstOperationId)}`);
-test.forgetShardSplit(firstOperationId);
-
-test.waitForGarbageCollection(firstOperationId, tenantIds);
+firstOperation.forget();
+test.waitForGarbageCollection(firstOperation.migrationId, tenantIds);
jsTestLog("Restarting replication on recipient nodes, and running new split operation");
test.recipientNodes.forEach(node => restartServerReplication(node));
@@ -69,28 +56,14 @@ test.donor.awaitReplication();
test.donor.nodes.forEach(
node => assert.commandWorked(setParameter(node, "shardSplitTimeoutMS", 60 * 1000)));
-const secondOperationId = UUID();
-assert.isnull(findMigration(donorPrimary, secondOperationId));
-const awaitSecondSplitOperation = startParallelShell(
- funWithArgs(
- function(migrationId, recipientTagName, recipientSetName, tenantIds) {
- assert.commandWorked(db.adminCommand(
- {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds}));
- },
- secondOperationId,
- test.recipientTagName,
- test.recipientSetName,
- tenantIds),
- donorPrimary.port);
-
-awaitSecondSplitOperation();
-assertMigrationState(donorPrimary, secondOperationId, "committed");
+const secondOperation = test.createSplitOperation(tenantIds);
+assert.isnull(findMigration(donorPrimary, secondOperation.migrationId));
+assert.commandWorked(secondOperation.commit());
+assertMigrationState(donorPrimary, secondOperation.migrationId, "committed");
test.removeRecipientNodesFromDonor();
+secondOperation.forget();
-jsTestLog(`Running forgetShardSplit command: ${tojson(secondOperationId)}`);
-test.forgetShardSplit(secondOperationId);
-
-test.waitForGarbageCollection(secondOperationId, tenantIds);
+test.waitForGarbageCollection(secondOperation.migrationId, tenantIds);
test.stop();
})();
diff --git a/jstests/serverless/shard_split_write_during_aborted_split.js b/jstests/serverless/shard_split_write_during_aborted_split.js
new file mode 100644
index 00000000000..5922954ab88
--- /dev/null
+++ b/jstests/serverless/shard_split_write_during_aborted_split.js
@@ -0,0 +1,69 @@
+/**
+ * Test that starts a shard split and aborts it while a write is in progress.
+ * @tags: [requires_fcv_52, featureFlagShardSplit]
+ */
+
+load("jstests/serverless/libs/shard_split_write_test.js");
+
+(function() {
+"use strict";
+
+const recipientTagName = "recipientNode";
+const recipientSetName = "recipientSetName";
+const test = new BasicServerlessTest({
+ recipientTagName,
+ recipientSetName,
+ nodeOptions: {
+        // Set a timeout large enough that the split does not time out during the test
+ setParameter: "shardSplitTimeoutMS=100000"
+ }
+});
+
+test.addRecipientNodes();
+test.donor.awaitSecondaryNodes();
+
+const donorPrimary = test.donor.getPrimary();
+const tenantIds = ["tenant1", "tenant2"];
+
+jsTestLog("Writing data before split");
+tenantIds.forEach(id => {
+ const kDbName = test.tenantDB(id, "testDb");
+ const kCollName = "testColl";
+ const kNs = `${kDbName}.${kCollName}`;
+
+ assert.commandWorked(donorPrimary.getCollection(kNs).insert(
+ [{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], {writeConcern: {w: "majority"}}));
+});
+
+const operation = test.createSplitOperation(tenantIds);
+
+const blockingFP = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterBlocking");
+
+const splitThread = operation.commitAsync();
+
+blockingFP.wait();
+
+const donorRst = createRstArgs(test.donor);
+test.removeRecipientsFromRstArgs(donorRst);
+const writeThread = new Thread(doWriteOperations, donorRst, tenantIds);
+writeThread.start();
+
+operation.abort();
+
+blockingFP.off();
+
+splitThread.join();
+const result = splitThread.returnData();
+assert.commandFailed(result);
+
+writeThread.join();
+const writeResults = writeThread.returnData();
+writeResults.forEach(res => {
+ assert.eq(res, ErrorCodes.OK);
+});
+
+TestData.skipCheckDBHashes = true;
+
+test.stop();
+})();
diff --git a/jstests/serverless/shard_split_write_during_shard_split.js b/jstests/serverless/shard_split_write_during_shard_split.js
new file mode 100644
index 00000000000..193c13dcdd5
--- /dev/null
+++ b/jstests/serverless/shard_split_write_during_shard_split.js
@@ -0,0 +1,68 @@
+/**
+ * Test that runs a shard split to completion and tries to write before and during the split.
+ * @tags: [requires_fcv_52, featureFlagShardSplit]
+ */
+
+load("jstests/serverless/libs/shard_split_write_test.js");
+
+(function() {
+"use strict";
+
+const recipientTagName = "recipientNode";
+const recipientSetName = "recipientSetName";
+const test = new BasicServerlessTest({
+ recipientTagName,
+ recipientSetName,
+ nodeOptions: {
+        // Set a timeout large enough that the split does not time out during the test
+ setParameter: "shardSplitTimeoutMS=100000"
+ }
+});
+
+test.addRecipientNodes();
+test.donor.awaitSecondaryNodes();
+
+const donorPrimary = test.donor.getPrimary();
+const tenantIds = ["tenant1", "tenant2"];
+
+jsTestLog("Writing data before split");
+tenantIds.forEach(id => {
+ const kDbName = test.tenantDB(id, "testDb");
+ const kCollName = "testColl";
+ const kNs = `${kDbName}.${kCollName}`;
+
+ assert.commandWorked(donorPrimary.getCollection(kNs).insert(
+ [{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], {writeConcern: {w: "majority"}}));
+});
+
+const operation = test.createSplitOperation(tenantIds);
+
+const blockingFP = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterBlocking");
+
+const splitThread = operation.commitAsync();
+
+blockingFP.wait();
+
+const donorRst = createRstArgs(test.donor);
+test.removeRecipientsFromRstArgs(donorRst);
+const writeThread = new Thread(doWriteOperations, donorRst, tenantIds);
+writeThread.start();
+
+blockingFP.off();
+
+splitThread.join();
+const result = splitThread.returnData();
+assert.eq(result.ok, 1);
+assert.eq(result.state, "committed");
+
+writeThread.join();
+const writeResults = writeThread.returnData();
+writeResults.forEach(res => {
+ assert.eq(res, ErrorCodes.TenantMigrationCommitted);
+});
+
+TestData.skipCheckDBHashes = true;
+
+test.stop();
+})();
diff --git a/jstests/serverless/shard_split_write_during_split_stepdown.js b/jstests/serverless/shard_split_write_during_split_stepdown.js
new file mode 100644
index 00000000000..d147f09412b
--- /dev/null
+++ b/jstests/serverless/shard_split_write_during_split_stepdown.js
@@ -0,0 +1,72 @@
+/**
+ * Test that runs a shard split, a stepdown, and write operations simultaneously to verify that
+ * the commands return the expected errors and the writes produce the expected results.
+ * @tags: [requires_fcv_52, featureFlagShardSplit]
+ */
+
+load("jstests/serverless/libs/shard_split_write_test.js");
+
+(function() {
+"use strict";
+
+const recipientTagName = "recipientNode";
+const recipientSetName = "recipientSetName";
+const test = new BasicServerlessTest({
+ recipientTagName,
+ recipientSetName,
+ nodeOptions: {
+        // Set a timeout large enough that the split does not time out during the test
+ setParameter: "shardSplitTimeoutMS=100000"
+ }
+});
+
+test.addRecipientNodes();
+test.donor.awaitSecondaryNodes();
+
+const donorPrimary = test.donor.getPrimary();
+const tenantIds = ["tenant1", "tenant2"];
+
+jsTestLog("Writing data before split");
+tenantIds.forEach(id => {
+ const kDbName = test.tenantDB(id, "testDb");
+ const kCollName = "testColl";
+ const kNs = `${kDbName}.${kCollName}`;
+
+ assert.commandWorked(donorPrimary.getCollection(kNs).insert(
+ [{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], {writeConcern: {w: "majority"}}));
+});
+
+const operation = test.createSplitOperation(tenantIds);
+
+const blockingFP = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterBlocking");
+
+const splitThread = operation.commitAsync();
+
+blockingFP.wait();
+
+const donorRst = createRstArgs(test.donor);
+test.removeRecipientsFromRstArgs(donorRst);
+const writeThread = new Thread(doWriteOperations, donorRst, tenantIds);
+writeThread.start();
+
+assert.commandWorked(donorPrimary.adminCommand({replSetStepDown: 360, force: true}));
+
+blockingFP.off();
+
+splitThread.join();
+const result = splitThread.returnData();
+assert.eq(result.ok, 0);
+assert.eq(result.code, ErrorCodes.InterruptedDueToReplStateChange);
+
+writeThread.join();
+const writeResults = writeThread.returnData();
+writeResults.forEach(res => {
+ assert.eq(res, ErrorCodes.TenantMigrationCommitted);
+});
+
+TestData.skipCheckDBHashes = true;
+
+test.stop();
+})();
diff --git a/src/mongo/db/serverless/shard_split_commands.cpp b/src/mongo/db/serverless/shard_split_commands.cpp
index 27b010d4ee9..caf3a44cfde 100644
--- a/src/mongo/db/serverless/shard_split_commands.cpp
+++ b/src/mongo/db/serverless/shard_split_commands.cpp
@@ -77,9 +77,15 @@ public:
auto state = donorPtr->decisionFuture().get(opCtx);
- auto response = Response(state.state);
+ uassert(ErrorCodes::TenantMigrationAborted,
+ "The shard split operation was aborted. " +
+ (state.abortReason ? state.abortReason->toString() : ""),
+ state.state != ShardSplitDonorStateEnum::kAborted);
+
+ Response response(state.state);
if (state.abortReason) {
BSONObjBuilder bob;
+
state.abortReason->serializeErrorToBSON(&bob);
response.setAbortReason(bob.obj());
}
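With this change, an aborted split no longer surfaces as ok:1 with state 'aborted': commitShardSplit itself now fails with TenantMigrationAborted (with the abortReason attached when available). This is what the updated tests assert, as in shard_split_wait_for_block_timestamp.js above:

    const res = firstOperation.commit({retryOnRetryableErrors: false});
    assert.commandFailed(res);
    assert.eq(res.code, ErrorCodes.TenantMigrationAborted);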
diff --git a/src/mongo/db/serverless/shard_split_donor_service.cpp b/src/mongo/db/serverless/shard_split_donor_service.cpp
index 5ff291c1470..633fc00e974 100644
--- a/src/mongo/db/serverless/shard_split_donor_service.cpp
+++ b/src/mongo/db/serverless/shard_split_donor_service.cpp
@@ -811,11 +811,12 @@ ShardSplitDonorService::DonorStateMachine::_handleErrorOrEnterAbortedState(
}
}
- if (primaryToken.isCanceled()) {
- return ExecutorFuture(
- **executor,
- StatusWith<DurableState>(ErrorCodes::InterruptedDueToReplStateChange,
- "Interrupted due to replica set state change"));
+ const auto status = statusWithState.getStatus();
+ if (ErrorCodes::isNotPrimaryError(status) || ErrorCodes::isShutdownError(status)) {
+        // Don't abort the split on retryable errors that may have been generated by the local
+        // server shutting down or stepping down, since the split can be resumed when the client
+        // retries.
+
+ return ExecutorFuture(**executor, statusWithState);
}
// There is no use to check the parent token the executor would not run if the parent token
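The donor-service change complements the command change: not-primary and shutdown errors now propagate to the caller instead of transitioning the split to aborted, so a retried commitShardSplit (for example via runShardSplitCommand with retryOnRetryableErrors=true) can resume the operation on the new primary. shard_split_write_during_split_stepdown.js above exercises the non-retrying path:

    const splitThread = operation.commitAsync();  // the async helper does not retry
    assert.commandWorked(donorPrimary.adminCommand({replSetStepDown: 360, force: true}));
    splitThread.join();
    const result = splitThread.returnData();
    // The command fails with a retryable error; the split itself is not aborted.
    assert.eq(result.code, ErrorCodes.InterruptedDueToReplStateChange);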