summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAntonio Fuschetto <antonio.fuschetto@mongodb.com>2022-02-07 11:28:10 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-02-07 11:55:28 +0000
commit111c63d91c067b00199635c42243d43784e7145b (patch)
treeaed51b9890194f89eb31b9b6fa885b72e9f72c75
parentab5508ea955f5b18521e0298a722535d153fb420 (diff)
downloadmongo-111c63d91c067b00199635c42243d43784e7145b.tar.gz
SERVER-63161 The recovery of the shard version and the migration in the moveChunk should be done in a separate thread
-rw-r--r--jstests/sharding/migration_failure.js22
-rw-r--r--src/mongo/db/s/migration_source_manager.cpp6
-rw-r--r--src/mongo/db/s/migration_util.cpp60
-rw-r--r--src/mongo/db/s/migration_util.h5
4 files changed, 46 insertions, 47 deletions
diff --git a/jstests/sharding/migration_failure.js b/jstests/sharding/migration_failure.js
index f731c0d3614..6263d3bd9c3 100644
--- a/jstests/sharding/migration_failure.js
+++ b/jstests/sharding/migration_failure.js
@@ -7,6 +7,16 @@
(function() {
'use strict';
+function waitAndGetShardVersion(conn, collNs) {
+ var shardVersion = undefined;
+ assert.soon(() => {
+ shardVersion = conn.adminCommand({getShardVersion: collNs}).global;
+ return !(typeof shardVersion == 'string' && shardVersion == 'UNKNOWN');
+ });
+
+ return shardVersion;
+}
+
var st = new ShardingTest({shards: 2, mongos: 1});
var mongos = st.s0;
@@ -30,12 +40,12 @@ var newVersion = null;
assert.commandWorked(st.shard0.getDB("admin").runCommand(
{configureFailPoint: 'failMigrationCommit', mode: 'alwaysOn'}));
-oldVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global;
+oldVersion = waitAndGetShardVersion(st.shard0, coll.toString());
assert.commandFailed(
admin.runCommand({moveChunk: coll + "", find: {_id: 0}, to: st.shard1.shardName}));
-newVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global;
+newVersion = waitAndGetShardVersion(st.shard0, coll.toString());
assert.eq(oldVersion.t,
newVersion.t,
@@ -55,24 +65,24 @@ assert.commandWorked(st.shard0.getDB("admin").runCommand(
// Run a migration where there will still be chunks in the collection remaining on the shard
// afterwards. This will cause the collection's shardVersion to be bumped higher.
-oldVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global;
+oldVersion = waitAndGetShardVersion(st.shard0, coll.toString());
assert.commandWorked(
admin.runCommand({moveChunk: coll + "", find: {_id: 1}, to: st.shard1.shardName}));
-newVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global;
+newVersion = waitAndGetShardVersion(st.shard0, coll.toString());
assert.lt(oldVersion.t, newVersion.t, "The major value in the shard version should have increased");
assert.eq(1, newVersion.i, "The minor value in the shard version should be 1");
// Run a migration to move off the shard's last chunk in the collection. The collection's
// shardVersion will be reset.
-oldVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global;
+oldVersion = waitAndGetShardVersion(st.shard0, coll.toString());
assert.commandWorked(
admin.runCommand({moveChunk: coll + "", find: {_id: -1}, to: st.shard1.shardName}));
-newVersion = st.shard0.getDB("admin").runCommand({getShardVersion: coll.toString()}).global;
+newVersion = waitAndGetShardVersion(st.shard0, coll.toString());
assert.gt(oldVersion.t,
newVersion.t,
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index b9b79f15d52..52735e850c5 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -411,7 +411,7 @@ void MigrationSourceManager::commitChunkOnRecipient() {
invariant(_state == kCriticalSection);
ScopeGuard scopedGuard([&] {
_cleanupOnError();
- migrationutil::recoverMigrationUntilSuccess(_opCtx, _args.getNss());
+ migrationutil::asyncRecoverMigrationUntilSuccessOrStepDown(_opCtx, _args.getNss());
});
// Tell the recipient shard to fetch the latest changes.
@@ -436,7 +436,7 @@ void MigrationSourceManager::commitChunkMetadataOnConfig() {
invariant(_state == kCloneCompleted);
ScopeGuard scopedGuard([&] {
_cleanupOnError();
- migrationutil::recoverMigrationUntilSuccess(_opCtx, _args.getNss());
+ migrationutil::asyncRecoverMigrationUntilSuccessOrStepDown(_opCtx, _args.getNss());
});
// If we have chunks left on the FROM shard, bump the version of one of them as well. This will
@@ -496,7 +496,7 @@ void MigrationSourceManager::commitChunkMetadataOnConfig() {
}
scopedGuard.dismiss();
_cleanup(false);
- migrationutil::recoverMigrationUntilSuccess(_opCtx, _args.getNss());
+ migrationutil::asyncRecoverMigrationUntilSuccessOrStepDown(_opCtx, _args.getNss());
uassertStatusOK(migrationCommitStatus);
}
diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp
index 24ed3c8147d..63a6806b9fe 100644
--- a/src/mongo/db/s/migration_util.cpp
+++ b/src/mongo/db/s/migration_util.cpp
@@ -243,6 +243,8 @@ void retryIdempotentWorkAsPrimaryUntilSuccessOrStepdown(
}
void refreshFilteringMetadataUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) {
+ hangBeforeFilteringMetadataRefresh.pauseWhileSet();
+
retryIdempotentWorkAsPrimaryUntilSuccessOrStepdown(
opCtx, "refreshFilteringMetadataUntilSuccess", [&nss](OperationContext* newOpCtx) {
hangInRefreshFilteringMetadataUntilSuccessInterruptible.pauseWhileSet(newOpCtx);
@@ -905,26 +907,7 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) {
CollectionShardingRuntime::get(opCtx, nss)->clearFilteringMetadata(opCtx);
}
- ExecutorFuture<void>(getMigrationUtilExecutor(opCtx->getServiceContext()))
- .then([serviceContext = opCtx->getServiceContext(), nss] {
- ThreadClient tc("TriggerMigrationRecovery", serviceContext);
- {
- stdx::lock_guard<Client> lk(*tc.get());
- tc->setSystemOperationKillableByStepdown(lk);
- }
-
- auto opCtx = tc->makeOperationContext();
-
- hangBeforeFilteringMetadataRefresh.pauseWhileSet();
-
- recoverMigrationUntilSuccess(opCtx.get(), nss);
- })
- .onError([](const Status& status) {
- LOGV2_WARNING(4798512,
- "Error on deferred shardVersion recovery execution",
- "error"_attr = redact(status));
- })
- .getAsync([](auto) {});
+ asyncRecoverMigrationUntilSuccessOrStepDown(opCtx, nss);
return true;
});
@@ -1195,24 +1178,29 @@ void drainMigrationsPendingRecovery(OperationContext* opCtx) {
}
}
-void recoverMigrationUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) noexcept {
- try {
- {
- AutoGetCollection autoColl(opCtx, nss, MODE_IS);
- auto* const csr = CollectionShardingRuntime::get(opCtx, nss);
- if (csr->getCurrentMetadataIfKnown()) {
- return;
+void asyncRecoverMigrationUntilSuccessOrStepDown(OperationContext* opCtx,
+ const NamespaceString& nss) noexcept {
+ ExecutorFuture<void>{Grid::get(opCtx)->getExecutorPool()->getFixedExecutor()}
+ .then([svcCtx{opCtx->getServiceContext()}, nss] {
+ ThreadClient tc{"MigrationRecovery", svcCtx};
+ {
+ stdx::lock_guard<Client> lk{*tc.get()};
+ tc->setSystemOperationKillableByStepdown(lk);
}
- }
+ auto uniqueOpCtx{tc->makeOperationContext()};
+ auto opCtx{uniqueOpCtx.get()};
- refreshFilteringMetadataUntilSuccess(opCtx, nss);
- } catch (const DBException& ex) {
- LOGV2_DEBUG(6228200,
- 2,
- "Interrupted migration recovery",
- "namespace"_attr = nss,
- "error"_attr = redact(ex));
- }
+ try {
+ refreshFilteringMetadataUntilSuccess(opCtx, nss);
+ } catch (const DBException& ex) {
+ // This is expected in the event of a stepdown.
+ LOGV2(6316100,
+ "Interrupted deferred migration recovery",
+ "namespace"_attr = nss,
+ "error"_attr = redact(ex));
+ }
+ })
+ .getAsync([](auto) {});
}
} // namespace migrationutil
diff --git a/src/mongo/db/s/migration_util.h b/src/mongo/db/s/migration_util.h
index 1593ffc64a0..ff2add3ba09 100644
--- a/src/mongo/db/s/migration_util.h
+++ b/src/mongo/db/s/migration_util.h
@@ -263,9 +263,10 @@ void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx);
void drainMigrationsPendingRecovery(OperationContext* opCtx);
/**
- * Recovers the migration until it succeeds or the node steps down.
+ * Submits an asynchronous task to recover the migration until it succeeds or the node steps down.
*/
-void recoverMigrationUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) noexcept;
+void asyncRecoverMigrationUntilSuccessOrStepDown(OperationContext* opCtx,
+ const NamespaceString& nss) noexcept;
} // namespace migrationutil
} // namespace mongo