diff options
author | Didier Nadeau <didier.nadeau@mongodb.com> | 2022-04-12 13:54:25 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-04-12 21:18:25 +0000 |
commit | cff8825855ae567667d3834861433f8f70303b84 (patch) | |
tree | 403c938079a4e3cdc1ac5ffadbc7f4caffae2b04 | |
parent | d8934508d139a3ed7d1fbd8d96971019166f2944 (diff) | |
download | mongo-cff8825855ae567667d3834861433f8f70303b84.tar.gz |
SERVER-63092 Improve split test library and add tests for write
13 files changed, 474 insertions, 101 deletions
diff --git a/jstests/serverless/libs/basic_serverless_test.js b/jstests/serverless/libs/basic_serverless_test.js index 67bb9093b0c..e9b56495733 100644 --- a/jstests/serverless/libs/basic_serverless_test.js +++ b/jstests/serverless/libs/basic_serverless_test.js @@ -1,3 +1,139 @@ +load("jstests/replsets/rslib.js"); +load("jstests/libs/parallelTester.js"); +load("jstests/libs/uuid_util.js"); + +const runCommitShardSplitAsync = function( + rstArgs, migrationIdString, tenantIds, recipientTagName, recipientSetName) { + load("jstests/replsets/rslib.js"); + + const donorRst = createRst(rstArgs, true); + const admin = donorRst.getPrimary().getDB("admin"); + + return admin.runCommand({ + commitShardSplit: 1, + migrationId: UUID(migrationIdString), + tenantIds, + recipientTagName, + recipientSetName + }); +}; + +const runShardSplitCommand = function(replicaSet, cmdObj, retryOnRetryableErrors) { + const primary = replicaSet.getPrimary(); + + let res; + assert.soon(() => { + try { + res = primary.adminCommand(cmdObj); + + if (!res.ok) { + // If retry is enabled and the command failed with a NotPrimary error, continue + // looping. + const cmdName = Object.keys(cmdObj)[0]; + if (retryOnRetryableErrors && isRetryableError(res.code)) { + jsTestLog(`runShardSplitCommand retryable error. Command: ${ + tojson(cmdObj)}, reply: ${tojson(res)}`); + primary = replicaSet.getPrimary(); + return false; + } + jsTestLog(`runShardSplitCommand fatal error. Command: ${tojson(cmdObj)}, reply: ${ + tojson(res)}`); + return true; + } + return true; + } catch (e) { + if (retryOnRetryableErrors && isRetryableError(e)) { + jsTestLog(`runShardSplitCommand retryable error. Command: ${ + tojson(cmdObj)}, reply: ${tojson(res)}`); + return false; + } + jsTestLog(`runShardSplitCommand fatal error. Command: ${tojson(cmdObj)}, reply: ${ + tojson(res)}`); + throw e; + } + }); + return res; +}; + +/** + * Utility class to run shard split operations. + */ +class ShardSplitOperation { + constructor(donorSet, recipientSetName, recipientTagName, tenantIds, migrationId) { + this.donorSet = donorSet; + this.recipientTagName = recipientTagName; + this.recipientSetName = recipientSetName; + this.tenantIds = tenantIds; + this.migrationId = migrationId; + } + + /** + * Starts a shard split synchronously. + */ + commit({retryOnRetryableErrors} = {retryOnRetryableErrors: true}) { + jsTestLog("Running commit command"); + + const primary = this.donorSet.getPrimary(); + + const localCmdObj = { + commitShardSplit: 1, + migrationId: this.migrationId, + tenantIds: this.tenantIds, + recipientTagName: this.recipientTagName, + recipientSetName: this.recipientSetName + }; + + return runShardSplitCommand(this.donorSet, localCmdObj, retryOnRetryableErrors); + } + + /** + * Starts a shard split asynchronously and returns the Thread that runs it. + */ + commitAsync() { + jsTestLog("Running commitAsync command"); + + const donorRst = createRstArgs(this.donorSet); + const migrationIdString = extractUUIDFromObject(this.migrationId); + + const thread = new Thread(runCommitShardSplitAsync, + donorRst, + migrationIdString, + this.tenantIds, + this.recipientTagName, + this.recipientSetName); + thread.start(); + + return thread; + } + + /** + * Forgets a shard split synchronously. + */ + forget() { + jsTestLog("Running forgetShardSplit command"); + + const cmdObj = {forgetShardSplit: 1, migrationId: this.migrationId}; + + assert.commandWorked( + runShardSplitCommand(this.donorSet, cmdObj, true /* retryableOnErrors */)); + } + + /** + * Aborts a shard split synchronously. + */ + abort() { + jsTestLog("Running abort command"); + + const admin = this.donorSet.getPrimary().getDB("admin"); + + return admin.runCommand({abortShardSplit: 1, migrationId: this.migrationId}); + } +} + +/** + * Utility class to create a ReplicaSetTest that provides functionnality to run a shard split + * operation. + */ class BasicServerlessTest { constructor({recipientTagName, recipientSetName, quickGarbageCollection = false, nodeOptions}) { nodeOptions = nodeOptions || {}; @@ -29,6 +165,19 @@ class BasicServerlessTest { } /* + * Returns a ShardSplitOperation object to run a shard split. + * @param {tenantIds} tells which tenant ids to run a split for. + */ + createSplitOperation(tenantIds) { + const migrationId = UUID(); + jsTestLog("Asserting no state document exist before command"); + assert.isnull(findMigration(this.donor.getPrimary(), migrationId)); + + return new ShardSplitOperation( + this.donor, this.recipientSetName, this.recipientTagName, tenantIds, migrationId); + } + + /* * Add recipient nodes to the current donor set. * @param {numNodes} indicates the number of recipient nodes to be added. */ @@ -114,6 +263,16 @@ class BasicServerlessTest { } /* + * Takes an rstArgs produced by createArgs and remove the recipient nodes from it. + */ + removeRecipientsFromRstArgs(rstArgs) { + rstArgs.nodeHosts = rstArgs.nodeHosts.filter(nodeString => { + const port = parseInt(nodeString.split(":")[1]); + return !this.recipientNodes.some(node => node.port == port); + }); + } + + /* * Wait for state document garbage collection by polling for when the document has been removed * from the tenantSplitDonors namespace, and all access blockers have been removed. * @param {migrationId} id that was used for the commitShardSplit command. @@ -150,13 +309,6 @@ class BasicServerlessTest { })); } - forgetShardSplit(migrationId) { - jsTestLog("Running forgetShardSplit command"); - const cmdObj = {forgetShardSplit: 1, migrationId: migrationId}; - const primary = this.donor.getPrimary(); - return assert.commandWorked(primary.getDB('admin').runCommand(cmdObj)); - } - removeRecipientNodesFromDonor() { jsTestLog("Removing recipient nodes from the donor."); this.donor.nodes = this.donor.nodes.filter(node => !this.recipientNodes.includes(node)); @@ -200,18 +352,18 @@ class BasicServerlessTest { BasicServerlessTest.kConfigSplitDonorsNS = "config.tenantSplitDonors"; -function findMigration(primary, uuid) { +function findMigration(primary, migrationId) { const donorsCollection = primary.getCollection(BasicServerlessTest.kConfigSplitDonorsNS); - return donorsCollection.findOne({"_id": uuid}); + return donorsCollection.findOne({"_id": migrationId}); } -function cleanupMigrationDocument(primary, uuid) { +function cleanupMigrationDocument(primary, migrationId) { const donorsCollection = primary.getCollection(BasicServerlessTest.kConfigSplitDonorsNS); - return donorsCollection.deleteOne({"_id": uuid}, {w: "majority"}); + return donorsCollection.deleteOne({"_id": migrationId}, {w: "majority"}); } -function assertMigrationState(primary, uuid, state) { - const migrationDoc = findMigration(primary, uuid); +function assertMigrationState(primary, migrationId, state) { + const migrationDoc = findMigration(primary, migrationId); assert(migrationDoc); if (migrationDoc.state === 'aborted') { diff --git a/jstests/serverless/libs/shard_split_write_test.js b/jstests/serverless/libs/shard_split_write_test.js new file mode 100644 index 00000000000..eb3685eb298 --- /dev/null +++ b/jstests/serverless/libs/shard_split_write_test.js @@ -0,0 +1,43 @@ +/** + * + * Tests that runs a shard split to completion and tries to write before and during the split. + * @tags: [requires_fcv_52, featureFlagShardSplit] + */ + +load("jstests/libs/fail_point_util.js"); // for "configureFailPoint" +load('jstests/libs/parallel_shell_helpers.js'); // for "startParallelShell" +load("jstests/serverless/libs/basic_serverless_test.js"); + +/* + * Connects to a replica set and runs write operation, returning the results. + * @param {rstArgs} replicaSetArgs for the replica set to connect to. + * @param {tenantIds} perform a write operation for each tenantId. + */ +function doWriteOperations(rstArgs, tenantIds) { + load("jstests/replsets/rslib.js"); + + const donorRst = createRst(rstArgs, true); + const donorPrimary = donorRst.getPrimary(); + + const writeResults = []; + + tenantIds.forEach(id => { + const kDbName = `${id}_testDb`; + const kCollName = "testColl"; + const kNs = `${kDbName}.${kCollName}`; + + const res = donorPrimary.getDB(kDbName) + .getCollection(kNs) + .insert([{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], + {writeConcern: {w: "majority"}}) + .getRawResponse(); + if (res.writeErrors.length > 0) { + writeResults.push(res.writeErrors[0].code); + } else { + // Push OK + writeResults.push(0); + } + }); + + return writeResults; +} diff --git a/jstests/serverless/shard_split_basic_test.js b/jstests/serverless/shard_split_basic_test.js index ad22026c10e..dbf72f974ef 100644 --- a/jstests/serverless/shard_split_basic_test.js +++ b/jstests/serverless/shard_split_basic_test.js @@ -13,31 +13,27 @@ load("jstests/serverless/libs/basic_serverless_test.js"); const recipientTagName = "recipientNode"; const recipientSetName = "recipientSetName"; +const tenantIds = ["tenant1", "tenant2"]; + const test = new BasicServerlessTest({recipientTagName, recipientSetName, quickGarbageCollection: true}); test.addRecipientNodes(); test.donor.awaitSecondaryNodes(); -const migrationId = UUID(); +const operation = test.createSplitOperation(tenantIds); jsTestLog("Running the commitShardSplit operation"); -const admin = test.donor.getPrimary().getDB("admin"); -const tenantIds = ["tenant1", "tenant2"]; -assert.soon(() => { - const result = admin.runCommand( - {commitShardSplit: 1, migrationId, tenantIds, recipientTagName, recipientSetName}); - assert.commandWorked(result); - return result.state === 'committed'; -}); +assert.commandWorked(operation.commit()); test.removeRecipientNodesFromDonor(); // getPrimary can only be called once recipient nodes have been remove from test. -assertMigrationState(test.donor.getPrimary(), migrationId, "committed"); +assertMigrationState(test.donor.getPrimary(), operation.migrationId, "committed"); + +operation.forget(); -test.forgetShardSplit(migrationId); +test.waitForGarbageCollection(operation.migrationId, tenantIds); -test.waitForGarbageCollection(migrationId, tenantIds); test.stop(); })(); diff --git a/jstests/serverless/shard_split_reconfig.js b/jstests/serverless/shard_split_reconfig.js index b0177a6f06b..06d1a31fd6a 100644 --- a/jstests/serverless/shard_split_reconfig.js +++ b/jstests/serverless/shard_split_reconfig.js @@ -15,6 +15,7 @@ function shardSplitApplySplitConfig() { // Skip db hash check because secondary is left with a different config. TestData.skipCheckDBHashes = true; + const tenantIds = ["tenant1", "tenant2"]; const test = new BasicServerlessTest({ recipientTagName: "recipientNode", @@ -24,24 +25,14 @@ function shardSplitApplySplitConfig() { test.addRecipientNodes(); const donorPrimary = test.donor.getPrimary(); - const migrationId = UUID(); - const tenantIds = ["tenant1", "tenant2"]; - jsTestLog("Asserting no state document exist before command"); - assert.isnull(findMigration(donorPrimary, migrationId)); + const operation = test.createSplitOperation(tenantIds); jsTestLog("Running commitShardSplit command"); - const adminDb = donorPrimary.getDB("admin"); - assert.commandWorked(adminDb.runCommand({ - commitShardSplit: 1, - migrationId, - recipientTagName: test.recipientTagName, - recipientSetName: test.recipientSetName, - tenantIds - })); + assert.commandWorked(operation.commit()); jsTestLog("Asserting state document exist after command"); - assertMigrationState(donorPrimary, migrationId, "committed"); + assertMigrationState(donorPrimary, operation.migrationId, "committed"); test.removeRecipientNodesFromDonor(); @@ -53,9 +44,9 @@ function shardSplitApplySplitConfig() { assert.eq(configDoc["recipientConfig"]["_id"], "recipient"); assert.eq(configDoc["recipientConfig"]["members"].length, 3); - test.forgetShardSplit(migrationId); + operation.forget(); - test.waitForGarbageCollection(migrationId, tenantIds); + test.waitForGarbageCollection(operation.migrationId, tenantIds); test.stop(); } diff --git a/jstests/serverless/shard_split_rejects_multiple_ops.js b/jstests/serverless/shard_split_rejects_multiple_ops.js index da91999f823..35099bf0633 100644 --- a/jstests/serverless/shard_split_rejects_multiple_ops.js +++ b/jstests/serverless/shard_split_rejects_multiple_ops.js @@ -70,7 +70,8 @@ function commitShardSplitConcurrently() { 117); // ConflictingOperationInProgress test.removeRecipientNodesFromDonor(); - test.forgetShardSplit(migrationId); + assert.commandWorked( + donorPrimary.adminCommand({forgetShardSplit: 1, migrationId: migrationId})); // fails because the commitShardSplit hasn't be garbage collected yet. assert.commandFailedWithCode(donorPrimary.adminCommand({ @@ -111,7 +112,7 @@ function commitShardSplitAfterAbort() { const admin = test.donor.getPrimary().getDB("admin"); let fp = configureFailPoint(admin, "pauseShardSplitAfterBlocking"); - assert.commandWorked(admin.runCommand( + assert.commandFailed(admin.runCommand( {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds})); fp.wait(); diff --git a/jstests/serverless/shard_split_startup_recovery_aborted.js b/jstests/serverless/shard_split_startup_recovery_aborted.js index dceca6a7f5b..56016dc4ccf 100644 --- a/jstests/serverless/shard_split_startup_recovery_aborted.js +++ b/jstests/serverless/shard_split_startup_recovery_aborted.js @@ -41,7 +41,7 @@ let fp = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterDe const tenantIds = ["tenant5", "tenant6"]; -assert.commandWorked(donorPrimary.adminCommand( +assert.commandFailed(donorPrimary.adminCommand( {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds})); fp.wait(); diff --git a/jstests/serverless/shard_split_tenant_access_blocking.js b/jstests/serverless/shard_split_tenant_access_blocking.js index 814a8c6818f..5303deeaf75 100644 --- a/jstests/serverless/shard_split_tenant_access_blocking.js +++ b/jstests/serverless/shard_split_tenant_access_blocking.js @@ -25,11 +25,12 @@ const test = new BasicServerlessTest({ test.addRecipientNodes(); const donorPrimary = test.donor.getPrimary(); -const migrationId = UUID(); +const maxTimeMS = 1 * 2000; // 2 seconds const tenantIds = ["tenant1", "tenant2"]; jsTestLog("Asserting no state document exist before command"); -assert.isnull(findMigration(donorPrimary, migrationId)); +const operation = test.createSplitOperation(tenantIds); +assert.isnull(findMigration(donorPrimary, operation.migrationId)); jsTestLog("Asserting we can write before the migration"); tenantIds.forEach(id => { @@ -44,16 +45,12 @@ const adminDb = donorPrimary.getDB("admin"); const blockingFailPoint = configureFailPoint(adminDb, "pauseShardSplitAfterBlocking"); jsTestLog("Running commitShardSplit command"); -const awaitCommand = startParallelShell( - funWithArgs(function(migrationId, recipientTagName, recipientSetName, tenantIds) { - assert.commandWorked(db.adminCommand( - {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds})); - }, migrationId, test.recipientTagName, test.recipientSetName, tenantIds), donorPrimary.port); +const runThread = operation.commitAsync(); blockingFailPoint.wait(); jsTestLog("Asserting state document is in blocking state"); -assertMigrationState(donorPrimary, migrationId, "blocking"); +assertMigrationState(donorPrimary, operation.migrationId, "blocking"); jsTestLog("Asserting we cannot write in blocking state"); let writeThreads = []; @@ -82,16 +79,20 @@ assert.soon(function() { jsTestLog("Disabling failpoints and waiting for command to complete"); blockingFailPoint.off(); -awaitCommand(); + +runThread.join(); +const data = runThread.returnData(); +assert(data["ok"] == 1); writeThreads.forEach(thread => thread.join()); jsTestLog("Asserting state document exist after command"); -assertMigrationState(donorPrimary, migrationId, "committed"); +assertMigrationState(donorPrimary, operation.migrationId, "committed"); test.removeRecipientNodesFromDonor(); -test.forgetShardSplit(migrationId); -test.waitForGarbageCollection(migrationId, tenantIds); +operation.forget(); + +test.waitForGarbageCollection(operation.migrationId, tenantIds); test.stop(); })(); diff --git a/jstests/serverless/shard_split_wait_for_block_timestamp.js b/jstests/serverless/shard_split_wait_for_block_timestamp.js index 09a86891593..9399b46f909 100644 --- a/jstests/serverless/shard_split_wait_for_block_timestamp.js +++ b/jstests/serverless/shard_split_wait_for_block_timestamp.js @@ -41,27 +41,14 @@ for (let i = 0; i < 2000; i++) { assert.commandWorked(bulk.execute()); jsTestLog("Running commitShardSplit command"); -const firstOperationId = UUID(); -assert.isnull(findMigration(donorPrimary, firstOperationId)); -const awaitFirstSplitOperation = startParallelShell( - funWithArgs( - function(migrationId, recipientTagName, recipientSetName, tenantIds) { - assert.commandWorked(db.adminCommand( - {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds})); - }, - firstOperationId, - test.recipientTagName, - test.recipientSetName, - tenantIds), - donorPrimary.port); +const firstOperation = test.createSplitOperation(tenantIds); +assert.isnull(findMigration(donorPrimary, firstOperation.migrationId)); +const res = firstOperation.commit({retryOnRetryableErrors: false}); +assert.commandFailed(res); +assert.eq(res.code, ErrorCodes.TenantMigrationAborted); -awaitFirstSplitOperation(); -assertMigrationState(donorPrimary, firstOperationId, "aborted"); - -jsTestLog(`Running forgetShardSplit command: ${tojson(firstOperationId)}`); -test.forgetShardSplit(firstOperationId); - -test.waitForGarbageCollection(firstOperationId, tenantIds); +firstOperation.forget(); +test.waitForGarbageCollection(firstOperation.migrationId, tenantIds); jsTestLog("Restarting replication on recipient nodes, and running new split operation"); test.recipientNodes.forEach(node => restartServerReplication(node)); @@ -69,28 +56,14 @@ test.donor.awaitReplication(); test.donor.nodes.forEach( node => assert.commandWorked(setParameter(node, "shardSplitTimeoutMS", 60 * 1000))); -const secondOperationId = UUID(); -assert.isnull(findMigration(donorPrimary, secondOperationId)); -const awaitSecondSplitOperation = startParallelShell( - funWithArgs( - function(migrationId, recipientTagName, recipientSetName, tenantIds) { - assert.commandWorked(db.adminCommand( - {commitShardSplit: 1, migrationId, recipientTagName, recipientSetName, tenantIds})); - }, - secondOperationId, - test.recipientTagName, - test.recipientSetName, - tenantIds), - donorPrimary.port); - -awaitSecondSplitOperation(); -assertMigrationState(donorPrimary, secondOperationId, "committed"); +const secondOperation = test.createSplitOperation(tenantIds); +assert.isnull(findMigration(donorPrimary, secondOperation.migrationId)); +assert.commandWorked(secondOperation.commit()); +assertMigrationState(donorPrimary, secondOperation.migrationId, "committed"); test.removeRecipientNodesFromDonor(); +secondOperation.forget(); -jsTestLog(`Running forgetShardSplit command: ${tojson(secondOperationId)}`); -test.forgetShardSplit(secondOperationId); - -test.waitForGarbageCollection(secondOperationId, tenantIds); +test.waitForGarbageCollection(secondOperation.migrationId, tenantIds); test.stop(); })(); diff --git a/jstests/serverless/shard_split_write_during_aborted_split.js b/jstests/serverless/shard_split_write_during_aborted_split.js new file mode 100644 index 00000000000..5922954ab88 --- /dev/null +++ b/jstests/serverless/shard_split_write_during_aborted_split.js @@ -0,0 +1,69 @@ +/** + * + * Test that starts a shard split and abort it while doing a write. + * @tags: [requires_fcv_52, featureFlagShardSplit] + */ + +load("jstests/serverless/libs/shard_split_write_test.js"); + +(function() { +"use strict"; + +const recipientTagName = "recipientNode"; +const recipientSetName = "recipientSetName"; +const test = new BasicServerlessTest({ + recipientTagName, + recipientSetName, + nodeOptions: { + // Set a short timeout to test that the operation times out waiting for replication + setParameter: "shardSplitTimeoutMS=100000" + } +}); + +test.addRecipientNodes(); +test.donor.awaitSecondaryNodes(); + +const donorPrimary = test.donor.getPrimary(); +const tenantIds = ["tenant1", "tenant2"]; + +jsTestLog("Writing data before split"); +tenantIds.forEach(id => { + const kDbName = test.tenantDB(id, "testDb"); + const kCollName = "testColl"; + const kNs = `${kDbName}.${kCollName}`; + + assert.commandWorked(donorPrimary.getCollection(kNs).insert( + [{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], {writeConcern: {w: "majority"}})); +}); + +const operation = test.createSplitOperation(tenantIds); + +const blockingFP = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterBlocking"); + +const splitThread = operation.commitAsync(); + +blockingFP.wait(); + +const donorRst = createRstArgs(test.donor); +test.removeRecipientsFromRstArgs(donorRst); +const writeThread = new Thread(doWriteOperations, donorRst, tenantIds); +writeThread.start(); + +operation.abort(); + +blockingFP.off(); + +splitThread.join(); +const result = splitThread.returnData(); +assert.commandFailed(result); + +writeThread.join(); +const writeResults = writeThread.returnData(); +writeResults.forEach(res => { + assert.eq(res, ErrorCodes.OK); +}); + +TestData.skipCheckDBHashes = true; + +test.stop(); +})(); diff --git a/jstests/serverless/shard_split_write_during_shard_split.js b/jstests/serverless/shard_split_write_during_shard_split.js new file mode 100644 index 00000000000..193c13dcdd5 --- /dev/null +++ b/jstests/serverless/shard_split_write_during_shard_split.js @@ -0,0 +1,68 @@ +/** + * + * Tests that runs a shard split to completion and tries to write before and during the split. + * @tags: [requires_fcv_52, featureFlagShardSplit] + */ + +load("jstests/serverless/libs/shard_split_write_test.js"); + +(function() { +"use strict"; + +const recipientTagName = "recipientNode"; +const recipientSetName = "recipientSetName"; +const test = new BasicServerlessTest({ + recipientTagName, + recipientSetName, + nodeOptions: { + // Set a short timeout to test that the operation times out waiting for replication + setParameter: "shardSplitTimeoutMS=100000" + } +}); + +test.addRecipientNodes(); +test.donor.awaitSecondaryNodes(); + +const donorPrimary = test.donor.getPrimary(); +const tenantIds = ["tenant1", "tenant2"]; + +jsTestLog("Writing data before split"); +tenantIds.forEach(id => { + const kDbName = test.tenantDB(id, "testDb"); + const kCollName = "testColl"; + const kNs = `${kDbName}.${kCollName}`; + + assert.commandWorked(donorPrimary.getCollection(kNs).insert( + [{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], {writeConcern: {w: "majority"}})); +}); + +const operation = test.createSplitOperation(tenantIds); + +const blockingFP = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterBlocking"); + +const splitThread = operation.commitAsync(); + +blockingFP.wait(); + +const donorRst = createRstArgs(test.donor); +test.removeRecipientsFromRstArgs(donorRst); +const writeThread = new Thread(doWriteOperations, donorRst, tenantIds); +writeThread.start(); + +blockingFP.off(); + +splitThread.join(); +const result = splitThread.returnData(); +assert.eq(result.ok, 1); +assert.eq(result.state, "committed"); + +writeThread.join(); +const writeResults = writeThread.returnData(); +writeResults.forEach(res => { + assert.eq(res, ErrorCodes.TenantMigrationCommitted); +}); + +TestData.skipCheckDBHashes = true; + +test.stop(); +})(); diff --git a/jstests/serverless/shard_split_write_during_split_stepdown.js b/jstests/serverless/shard_split_write_during_split_stepdown.js new file mode 100644 index 00000000000..d147f09412b --- /dev/null +++ b/jstests/serverless/shard_split_write_during_split_stepdown.js @@ -0,0 +1,72 @@ +/** + * + * Tests that runs a shard split, a stepdown and writes operation simultaneously to verify the + * commands return the expected errors and success. + * result of write operations. + * @tags: [requires_fcv_52, featureFlagShardSplit] + */ + +load("jstests/serverless/libs/shard_split_write_test.js"); + +(function() { +"use strict"; + +const recipientTagName = "recipientNode"; +const recipientSetName = "recipientSetName"; +const test = new BasicServerlessTest({ + recipientTagName, + recipientSetName, + nodeOptions: { + // Set a short timeout to test that the operation times out waiting for replication + setParameter: "shardSplitTimeoutMS=100000" + } +}); + +test.addRecipientNodes(); +test.donor.awaitSecondaryNodes(); + +const donorPrimary = test.donor.getPrimary(); +const tenantIds = ["tenant1", "tenant2"]; + +jsTestLog("Writing data before split"); +tenantIds.forEach(id => { + const kDbName = test.tenantDB(id, "testDb"); + const kCollName = "testColl"; + const kNs = `${kDbName}.${kCollName}`; + + assert.commandWorked(donorPrimary.getCollection(kNs).insert( + [{_id: 0, x: 0}, {_id: 1, x: 1}, {_id: 2, x: 2}], {writeConcern: {w: "majority"}})); +}); + +const operation = test.createSplitOperation(tenantIds); + +const blockingFP = configureFailPoint(donorPrimary.getDB("admin"), "pauseShardSplitAfterBlocking"); + +const splitThread = operation.commitAsync(); + +blockingFP.wait(); + +const donorRst = createRstArgs(test.donor); +test.removeRecipientsFromRstArgs(donorRst); +const writeThread = new Thread(doWriteOperations, donorRst, tenantIds); +writeThread.start(); + +assert.commandWorked(donorPrimary.adminCommand({replSetStepDown: 360, force: true})); + +blockingFP.off(); + +splitThread.join(); +const result = splitThread.returnData(); +assert.eq(result.ok, 0); +assert.eq(result.code, ErrorCodes.InterruptedDueToReplStateChange); + +writeThread.join(); +const writeResults = writeThread.returnData(); +writeResults.forEach(res => { + assert.eq(res, ErrorCodes.TenantMigrationCommitted); +}); + +TestData.skipCheckDBHashes = true; + +test.stop(); +})(); diff --git a/src/mongo/db/serverless/shard_split_commands.cpp b/src/mongo/db/serverless/shard_split_commands.cpp index 27b010d4ee9..caf3a44cfde 100644 --- a/src/mongo/db/serverless/shard_split_commands.cpp +++ b/src/mongo/db/serverless/shard_split_commands.cpp @@ -77,9 +77,15 @@ public: auto state = donorPtr->decisionFuture().get(opCtx); - auto response = Response(state.state); + uassert(ErrorCodes::TenantMigrationAborted, + "The shard split operation was aborted. " + + (state.abortReason ? state.abortReason->toString() : ""), + state.state != ShardSplitDonorStateEnum::kAborted); + + Response response(state.state); if (state.abortReason) { BSONObjBuilder bob; + state.abortReason->serializeErrorToBSON(&bob); response.setAbortReason(bob.obj()); } diff --git a/src/mongo/db/serverless/shard_split_donor_service.cpp b/src/mongo/db/serverless/shard_split_donor_service.cpp index 5ff291c1470..633fc00e974 100644 --- a/src/mongo/db/serverless/shard_split_donor_service.cpp +++ b/src/mongo/db/serverless/shard_split_donor_service.cpp @@ -811,11 +811,12 @@ ShardSplitDonorService::DonorStateMachine::_handleErrorOrEnterAbortedState( } } - if (primaryToken.isCanceled()) { - return ExecutorFuture( - **executor, - StatusWith<DurableState>(ErrorCodes::InterruptedDueToReplStateChange, - "Interrupted due to replica set state change")); + const auto status = statusWithState.getStatus(); + if (ErrorCodes::isNotPrimaryError(status) || ErrorCodes::isShutdownError(status)) { + // Don't abort the split on retriable errors that may have been generated by the local + // server shutting/stepping down because it can be resumed when the client retries. + + return ExecutorFuture(**executor, statusWithState); } // There is no use to check the parent token the executor would not run if the parent token |