summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> 2021-12-30 11:05:11 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-30 11:29:35 +0000
commit dd35d0eae5c81db28eb618ae0ae588e32a4a617a (patch)
tree 497a3f8b4650765a220ae04ac2d07e1903a32b6f
parent 23bf8408394c73fc143a8093105a688865f5cd4a (diff)
download mongo-dd35d0eae5c81db28eb618ae0ae588e32a4a617a.tar.gz
SERVER-62296 MoveChunk should recover any unfinished migration before starting a new one
-rw-r--r-- etc/backports_required_for_multiversion_tests.yml 4
-rw-r--r-- jstests/sharding/migration_recovers_unfinished_migrations.js 81
-rw-r--r-- src/mongo/db/s/migration_source_manager.cpp 12
-rw-r--r-- src/mongo/db/s/migration_util.cpp 18
-rw-r--r-- src/mongo/db/s/migration_util.h 6
5 files changed, 121 insertions, 0 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 834509102be..d8cd34bf4a1 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -134,6 +134,8 @@ last-continuous:
test_file: jstests/replsets/tenant_migration_transaction_boundary.js
- ticket: SERVER-62212
test_file: jstests/replsets/dbcheck_write_concern.js
+ - ticket: SERVER-62296
+ test_file: jstests/sharding/migration_recovers_unfinished_migrations.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
@@ -407,6 +409,8 @@ last-lts:
test_file: jstests/auth/dbcheck.js
- ticket: SERVER-61666
test_file: jstests/replsets/tenant_migration_transaction_boundary.js
+ - ticket: SERVER-62296
+ test_file: jstests/sharding/migration_recovers_unfinished_migrations.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
diff --git a/jstests/sharding/migration_recovers_unfinished_migrations.js b/jstests/sharding/migration_recovers_unfinished_migrations.js
new file mode 100644
index 00000000000..15e22ee0005
--- /dev/null
+++ b/jstests/sharding/migration_recovers_unfinished_migrations.js
@@ -0,0 +1,81 @@
+/**
+ * Tests that while there is an unfinished migration pending recovery, if a new migration (of a
+ * different collection) attempts to start, it will first need to recover the unfinished migration.
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load('jstests/libs/chunk_manipulation_util.js');
+
+// Disable checking for index consistency to ensure that the config server doesn't trigger a
+// StaleShardVersion exception on the shards and cause them to refresh their sharding metadata. That
+// would interfere with the precise migration recovery interleaving this test requires.
+const nodeOptions = {
+ setParameter: {enableShardedIndexConsistencyCheck: false}
+};
+
+// Disable balancer in order to prevent balancing rounds from triggering shard version refreshes on
+// the shards that would interfere with the migration recovery interleaving this test requires.
+var st = new ShardingTest({shards: 2, other: {configOptions: nodeOptions, enableBalancer: false}});
+let staticMongod = MongoRunner.runMongod({});
+
+const dbName = "test";
+const collNameA = "foo";
+const collNameB = "bar";
+const nsA = dbName + "." + collNameA;
+const nsB = dbName + "." + collNameB;
+
+assert.commandWorked(
+ st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName}));
+assert.commandWorked(st.s.adminCommand({shardCollection: nsA, key: {_id: 1}}));
+assert.commandWorked(st.s.adminCommand({shardCollection: nsB, key: {_id: 1}}));
+
+// Hang before commit migration
+let moveChunkHangAtStep5Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep5");
+var joinMoveChunk1 = moveChunkParallel(
+ staticMongod, st.s0.host, {_id: 0}, null, nsA, st.shard1.shardName, true /* expectSuccess */);
+
+moveChunkHangAtStep5Failpoint.wait();
+
+let migrationCommitNetworkErrorFailpoint =
+ configureFailPoint(st.rs0.getPrimary(), "migrationCommitNetworkError");
+let skipShardFilteringMetadataRefreshFailpoint =
+ configureFailPoint(st.rs0.getPrimary(), "skipShardFilteringMetadataRefresh");
+
+moveChunkHangAtStep5Failpoint.off();
+joinMoveChunk1();
+migrationCommitNetworkErrorFailpoint.off();
+skipShardFilteringMetadataRefreshFailpoint.off();
+
+// The migration is left pending recovery.
+{
+ let migrationCoordinatorDocuments =
+ st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray();
+ assert.eq(1, migrationCoordinatorDocuments.length);
+ assert.eq(nsA, migrationCoordinatorDocuments[0].nss);
+}
+
+// Start a second migration on a different collection and wait until it persists its recovery
+// document.
+let moveChunkHangAtStep3Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep3");
+
+var joinMoveChunk2 = moveChunkParallel(
+ staticMongod, st.s0.host, {_id: 0}, null, nsB, st.shard1.shardName, true /* expectSuccess */);
+moveChunkHangAtStep3Failpoint.wait();
+
+// Check that the first migration has been recovered. There must be only one
+// config.migrationCoordinators document, which corresponds to the second migration.
+{
+ let migrationCoordinatorDocuments =
+ st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray();
+ assert.eq(1, migrationCoordinatorDocuments.length);
+ assert.eq(nsB, migrationCoordinatorDocuments[0].nss);
+}
+
+moveChunkHangAtStep3Failpoint.off();
+joinMoveChunk2();
+
+MongoRunner.stopMongod(staticMongod);
+st.stop();
+})();
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index 0a0f0ab82e4..4d2be1c2081 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -176,6 +176,18 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
// command.
onShardVersionMismatch(_opCtx, _args.getNss(), boost::none);
+ // Complete any unfinished migration pending recovery
+ {
+ migrationutil::drainMigrationsPendingRecovery(opCtx);
+
+ // Since the moveChunk command is holding the ActiveMigrationRegistry and we just drained
+ // all migrations pending recovery, now there cannot be any document in
+ // config.migrationCoordinators.
+ PersistentTaskStore<MigrationCoordinatorDocument> store(
+ NamespaceString::kMigrationCoordinatorsNamespace);
+ invariant(store.count(opCtx) == 0);
+ }
+
// Snapshot the committed metadata from the time the migration starts
const auto [collectionMetadata, collectionUUID] = [&] {
UninterruptibleLockGuard noInterrupt(_opCtx->lockState());
diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp
index 05896b00291..61fe9f728eb 100644
--- a/src/mongo/db/s/migration_util.cpp
+++ b/src/mongo/db/s/migration_util.cpp
@@ -1177,5 +1177,23 @@ void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx) {
"ongoingRecipientCritSecCount"_attr = ongoingMigrationRecipientsCount);
}
+void drainMigrationsPendingRecovery(OperationContext* opCtx) {
+    PersistentTaskStore<MigrationCoordinatorDocument> pendingMigrations(
+        NamespaceString::kMigrationCoordinatorsNamespace);
+
+    // Visitor run on each migration coordinator document: triggers recovery for the document's
+    // namespace and, on failure, tags the exception with the offending document before letting it
+    // propagate. NOTE(review): returning true presumably tells forEach to keep iterating - confirm
+    // against PersistentTaskStore.
+    auto recoverOne = [opCtx](const MigrationCoordinatorDocument& coordinatorDoc) {
+        try {
+            onShardVersionMismatch(opCtx, coordinatorDoc.getNss(), boost::none);
+        } catch (DBException& ex) {
+            ex.addContext(str::stream() << "Failed to recover pending migration for document "
+                                        << coordinatorDoc.toBSON());
+            throw;
+        }
+        return true;
+    };
+
+    // Keep sweeping the store until no migration coordinator document is left.
+    while (pendingMigrations.count(opCtx) != 0) {
+        pendingMigrations.forEach(opCtx, BSONObj(), recoverOne);
+    }
+}
+
} // namespace migrationutil
} // namespace mongo
diff --git a/src/mongo/db/s/migration_util.h b/src/mongo/db/s/migration_util.h
index 2b16a601bb2..4b956c8a6ab 100644
--- a/src/mongo/db/s/migration_util.h
+++ b/src/mongo/db/s/migration_util.h
@@ -254,5 +254,11 @@ void deleteMigrationRecipientRecoveryDocument(OperationContext* opCtx, const UUI
*/
void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx);
+/**
+ * Recovers all unfinished migrations pending recovery.
+ *
+ * Keeps iterating over the documents stored in the migration coordinators namespace, invoking
+ * onShardVersionMismatch() for the namespace of each one, until that store reports no documents.
+ * A DBException raised while recovering a document is rethrown with added context that includes
+ * the document.
+ *
+ * Note: This method assumes its caller is preventing new migrations from starting.
+ */
+void drainMigrationsPendingRecovery(OperationContext* opCtx);
+
} // namespace migrationutil
} // namespace mongo