From 111c63d91c067b00199635c42243d43784e7145b Mon Sep 17 00:00:00 2001 From: Antonio Fuschetto Date: Mon, 7 Feb 2022 11:28:10 +0000 Subject: SERVER-63161 The recovery of the shard version and the migration in the moveChunk should be done in a separate thread --- jstests/sharding/migration_failure.js | 22 ++++++++--- src/mongo/db/s/migration_source_manager.cpp | 6 +-- src/mongo/db/s/migration_util.cpp | 60 ++++++++++++----------------- src/mongo/db/s/migration_util.h | 5 ++- 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/jstests/sharding/migration_failure.js b/jstests/sharding/migration_failure.js index f731c0d3614..6263d3bd9c3 100644 --- a/jstests/sharding/migration_failure.js +++ b/jstests/sharding/migration_failure.js @@ -7,6 +7,16 @@ (function() { 'use strict'; +function waitAndGetShardVersion(conn, collNs) { + var shardVersion = undefined; + assert.soon(() => { + shardVersion = conn.adminCommand({getShardVersion: collNs}).global; + return !(typeof shardVersion == 'string' && shardVersion == 'UNKNOWN'); + }); + + return shardVersion; +} + var st = new ShardingTest({shards: 2, mongos: 1}); var mongos = st.s0; @@ -30,12 +40,12 @@ var newVersion = null; assert.commandWorked(st.shard0.getDB("admin").runCommand( {configureFailPoint: 'failMigrationCommit', mode: 'alwaysOn'})); -oldVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global; +oldVersion = waitAndGetShardVersion(st.shard0, coll.toString()); assert.commandFailed( admin.runCommand({moveChunk: coll + "", find: {_id: 0}, to: st.shard1.shardName})); -newVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global; +newVersion = waitAndGetShardVersion(st.shard0, coll.toString()); assert.eq(oldVersion.t, newVersion.t, @@ -55,24 +65,24 @@ assert.commandWorked(st.shard0.getDB("admin").runCommand( // Run a migration where there will still be chunks in the collection remaining on the shard // afterwards. This will cause the collection's shardVersion to be bumped higher. -oldVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global; +oldVersion = waitAndGetShardVersion(st.shard0, coll.toString()); assert.commandWorked( admin.runCommand({moveChunk: coll + "", find: {_id: 1}, to: st.shard1.shardName})); -newVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global; +newVersion = waitAndGetShardVersion(st.shard0, coll.toString()); assert.lt(oldVersion.t, newVersion.t, "The major value in the shard version should have increased"); assert.eq(1, newVersion.i, "The minor value in the shard version should be 1"); // Run a migration to move off the shard's last chunk in the collection. The collection's // shardVersion will be reset. -oldVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global; +oldVersion = waitAndGetShardVersion(st.shard0, coll.toString()); assert.commandWorked( admin.runCommand({moveChunk: coll + "", find: {_id: -1}, to: st.shard1.shardName})); -newVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global; +newVersion = waitAndGetShardVersion(st.shard0, coll.toString()); assert.gt(oldVersion.t, newVersion.t, diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index b9b79f15d52..52735e850c5 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -411,7 +411,7 @@ void MigrationSourceManager::commitChunkOnRecipient() { invariant(_state == kCriticalSection); ScopeGuard scopedGuard([&] { _cleanupOnError(); - migrationutil::recoverMigrationUntilSuccess(_opCtx, _args.getNss()); + migrationutil::asyncRecoverMigrationUntilSuccessOrStepDown(_opCtx, _args.getNss()); }); // Tell the recipient shard to fetch the latest changes. @@ -436,7 +436,7 @@ void MigrationSourceManager::commitChunkMetadataOnConfig() { invariant(_state == kCloneCompleted); ScopeGuard scopedGuard([&] { _cleanupOnError(); - migrationutil::recoverMigrationUntilSuccess(_opCtx, _args.getNss()); + migrationutil::asyncRecoverMigrationUntilSuccessOrStepDown(_opCtx, _args.getNss()); }); // If we have chunks left on the FROM shard, bump the version of one of them as well. This will @@ -496,7 +496,7 @@ void MigrationSourceManager::commitChunkMetadataOnConfig() { } scopedGuard.dismiss(); _cleanup(false); - migrationutil::recoverMigrationUntilSuccess(_opCtx, _args.getNss()); + migrationutil::asyncRecoverMigrationUntilSuccessOrStepDown(_opCtx, _args.getNss()); uassertStatusOK(migrationCommitStatus); } diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp index 24ed3c8147d..63a6806b9fe 100644 --- a/src/mongo/db/s/migration_util.cpp +++ b/src/mongo/db/s/migration_util.cpp @@ -243,6 +243,8 @@ void retryIdempotentWorkAsPrimaryUntilSuccessOrStepdown( } void refreshFilteringMetadataUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) { + hangBeforeFilteringMetadataRefresh.pauseWhileSet(); + retryIdempotentWorkAsPrimaryUntilSuccessOrStepdown( opCtx, "refreshFilteringMetadataUntilSuccess", [&nss](OperationContext* newOpCtx) { hangInRefreshFilteringMetadataUntilSuccessInterruptible.pauseWhileSet(newOpCtx); @@ -905,26 +907,7 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) { CollectionShardingRuntime::get(opCtx, nss)->clearFilteringMetadata(opCtx); } - ExecutorFuture(getMigrationUtilExecutor(opCtx->getServiceContext())) - .then([serviceContext = opCtx->getServiceContext(), nss] { - ThreadClient tc("TriggerMigrationRecovery", serviceContext); - { - stdx::lock_guard lk(*tc.get()); - tc->setSystemOperationKillableByStepdown(lk); - } - - auto opCtx = tc->makeOperationContext(); - - hangBeforeFilteringMetadataRefresh.pauseWhileSet(); - - recoverMigrationUntilSuccess(opCtx.get(), nss); - }) - .onError([](const Status& status) { - LOGV2_WARNING(4798512, - "Error on deferred shardVersion recovery execution", - "error"_attr = redact(status)); - }) - .getAsync([](auto) {}); + asyncRecoverMigrationUntilSuccessOrStepDown(opCtx, nss); return true; }); @@ -1195,24 +1178,29 @@ void drainMigrationsPendingRecovery(OperationContext* opCtx) { } } -void recoverMigrationUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) noexcept { - try { - { - AutoGetCollection autoColl(opCtx, nss, MODE_IS); - auto* const csr = CollectionShardingRuntime::get(opCtx, nss); - if (csr->getCurrentMetadataIfKnown()) { - return; +void asyncRecoverMigrationUntilSuccessOrStepDown(OperationContext* opCtx, + const NamespaceString& nss) noexcept { + ExecutorFuture{Grid::get(opCtx)->getExecutorPool()->getFixedExecutor()} + .then([svcCtx{opCtx->getServiceContext()}, nss] { + ThreadClient tc{"MigrationRecovery", svcCtx}; + { + stdx::lock_guard lk{*tc.get()}; + tc->setSystemOperationKillableByStepdown(lk); } - } + auto uniqueOpCtx{tc->makeOperationContext()}; + auto opCtx{uniqueOpCtx.get()}; - refreshFilteringMetadataUntilSuccess(opCtx, nss); - } catch (const DBException& ex) { - LOGV2_DEBUG(6228200, - 2, - "Interrupted migration recovery", - "namespace"_attr = nss, - "error"_attr = redact(ex)); - } + try { + refreshFilteringMetadataUntilSuccess(opCtx, nss); + } catch (const DBException& ex) { + // This is expected in the event of a stepdown. + LOGV2(6316100, + "Interrupted deferred migration recovery", + "namespace"_attr = nss, + "error"_attr = redact(ex)); + } + }) + .getAsync([](auto) {}); } } // namespace migrationutil diff --git a/src/mongo/db/s/migration_util.h b/src/mongo/db/s/migration_util.h index 1593ffc64a0..ff2add3ba09 100644 --- a/src/mongo/db/s/migration_util.h +++ b/src/mongo/db/s/migration_util.h @@ -263,9 +263,10 @@ void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx); void drainMigrationsPendingRecovery(OperationContext* opCtx); /** - * Recovers the migration until it succeeds or the node steps down. + * Submits an asynchronous task to recover the migration until it succeeds or the node steps down. */ -void recoverMigrationUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) noexcept; +void asyncRecoverMigrationUntilSuccessOrStepDown(OperationContext* opCtx, + const NamespaceString& nss) noexcept; } // namespace migrationutil } // namespace mongo -- cgit v1.2.1