diff options
author | Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> | 2021-12-30 11:05:11 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-30 11:29:35 +0000 |
commit | dd35d0eae5c81db28eb618ae0ae588e32a4a617a (patch) | |
tree | 497a3f8b4650765a220ae04ac2d07e1903a32b6f | |
parent | 23bf8408394c73fc143a8093105a688865f5cd4a (diff) | |
download | mongo-dd35d0eae5c81db28eb618ae0ae588e32a4a617a.tar.gz |
SERVER-62296 MoveChunk should recover any unfinished migration before starting a new one
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 4 | ||||
-rw-r--r-- | jstests/sharding/migration_recovers_unfinished_migrations.js | 81 | ||||
-rw-r--r-- | src/mongo/db/s/migration_source_manager.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/s/migration_util.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/s/migration_util.h | 6 |
5 files changed, 121 insertions, 0 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 834509102be..d8cd34bf4a1 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -134,6 +134,8 @@ last-continuous: test_file: jstests/replsets/tenant_migration_transaction_boundary.js - ticket: SERVER-62212 test_file: jstests/replsets/dbcheck_write_concern.js + - ticket: SERVER-62296 + test_file: jstests/sharding/migration_recovers_unfinished_migrations.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: @@ -407,6 +409,8 @@ last-lts: test_file: jstests/auth/dbcheck.js - ticket: SERVER-61666 test_file: jstests/replsets/tenant_migration_transaction_boundary.js + - ticket: SERVER-62296 + test_file: jstests/sharding/migration_recovers_unfinished_migrations.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: diff --git a/jstests/sharding/migration_recovers_unfinished_migrations.js b/jstests/sharding/migration_recovers_unfinished_migrations.js new file mode 100644 index 00000000000..15e22ee0005 --- /dev/null +++ b/jstests/sharding/migration_recovers_unfinished_migrations.js @@ -0,0 +1,81 @@ +/** + * Tests that while there is an unfinished migration pending recovery, if a new migration (of a + * different collection) attempts to start, it will first need to recover the unfinished migration. + */ +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load('jstests/libs/chunk_manipulation_util.js'); + +// Disable checking for index consistency to ensure that the config server doesn't trigger a +// StaleShardVersion exception on the shards and cause them to refresh their sharding metadata. That +// would interfere with the precise migration recovery interleaving this test requires. +const nodeOptions = { + setParameter: {enableShardedIndexConsistencyCheck: false} +}; + +// Disable balancer in order to prevent balancing rounds from triggering shard version refreshes on +// the shards that would interfere with the migration recovery interleaving this test requires. +var st = new ShardingTest({shards: 2, other: {configOptions: nodeOptions, enableBalancer: false}}); +let staticMongod = MongoRunner.runMongod({}); + +const dbName = "test"; +const collNameA = "foo"; +const collNameB = "bar"; +const nsA = dbName + "." + collNameA; +const nsB = dbName + "." + collNameB; + +assert.commandWorked( + st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName})); +assert.commandWorked(st.s.adminCommand({shardCollection: nsA, key: {_id: 1}})); +assert.commandWorked(st.s.adminCommand({shardCollection: nsB, key: {_id: 1}})); + +// Hang before commit migration +let moveChunkHangAtStep5Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep5"); +var joinMoveChunk1 = moveChunkParallel( + staticMongod, st.s0.host, {_id: 0}, null, nsA, st.shard1.shardName, true /* expectSuccess */); + +moveChunkHangAtStep5Failpoint.wait(); + +let migrationCommitNetworkErrorFailpoint = + configureFailPoint(st.rs0.getPrimary(), "migrationCommitNetworkError"); +let skipShardFilteringMetadataRefreshFailpoint = + configureFailPoint(st.rs0.getPrimary(), "skipShardFilteringMetadataRefresh"); + +moveChunkHangAtStep5Failpoint.off(); +joinMoveChunk1(); +migrationCommitNetworkErrorFailpoint.off(); +skipShardFilteringMetadataRefreshFailpoint.off(); + +// The migration is left pending recovery. +{ + let migrationCoordinatorDocuments = + st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray(); + assert.eq(1, migrationCoordinatorDocuments.length); + assert.eq(nsA, migrationCoordinatorDocuments[0].nss); +} + +// Start a second migration on a different collection and wait until it persists its recovery +// document. +let moveChunkHangAtStep3Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep3"); + +var joinMoveChunk2 = moveChunkParallel( + staticMongod, st.s0.host, {_id: 0}, null, nsB, st.shard1.shardName, true /* expectSuccess */); +moveChunkHangAtStep3Failpoint.wait(); + +// Check that the first migration has been recovered. There must be only one +// config.migrationCoordinators document, which corresponds to the second migration. +{ + let migrationCoordinatorDocuments = + st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray(); + assert.eq(1, migrationCoordinatorDocuments.length); + assert.eq(nsB, migrationCoordinatorDocuments[0].nss); +} + +moveChunkHangAtStep3Failpoint.off(); +joinMoveChunk2(); + +MongoRunner.stopMongod(staticMongod); +st.stop(); +})(); diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index 0a0f0ab82e4..4d2be1c2081 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -176,6 +176,18 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, // command. onShardVersionMismatch(_opCtx, _args.getNss(), boost::none); + // Complete any unfinished migration pending recovery + { + migrationutil::drainMigrationsPendingRecovery(opCtx); + + // Since the moveChunk command is holding the ActiveMigrationRegistry and we just drained + // all migrations pending recovery, now there cannot be any document in + // config.migrationCoordinators. + PersistentTaskStore<MigrationCoordinatorDocument> store( + NamespaceString::kMigrationCoordinatorsNamespace); + invariant(store.count(opCtx) == 0); + } + // Snapshot the committed metadata from the time the migration starts const auto [collectionMetadata, collectionUUID] = [&] { UninterruptibleLockGuard noInterrupt(_opCtx->lockState()); diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp index 05896b00291..61fe9f728eb 100644 --- a/src/mongo/db/s/migration_util.cpp +++ b/src/mongo/db/s/migration_util.cpp @@ -1177,5 +1177,23 @@ void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx) { "ongoingRecipientCritSecCount"_attr = ongoingMigrationRecipientsCount); } +void drainMigrationsPendingRecovery(OperationContext* opCtx) { + PersistentTaskStore<MigrationCoordinatorDocument> store( + NamespaceString::kMigrationCoordinatorsNamespace); + + while (store.count(opCtx)) { + store.forEach(opCtx, BSONObj(), [opCtx](const MigrationCoordinatorDocument& doc) { + try { + onShardVersionMismatch(opCtx, doc.getNss(), boost::none); + } catch (DBException& ex) { + ex.addContext(str::stream() << "Failed to recover pending migration for document " + << doc.toBSON()); + throw; + } + return true; + }); + } +} + } // namespace migrationutil } // namespace mongo diff --git a/src/mongo/db/s/migration_util.h b/src/mongo/db/s/migration_util.h index 2b16a601bb2..4b956c8a6ab 100644 --- a/src/mongo/db/s/migration_util.h +++ b/src/mongo/db/s/migration_util.h @@ -254,5 +254,11 @@ void deleteMigrationRecipientRecoveryDocument(OperationContext* opCtx, const UUI */ void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx); +/** + * Recovers all unfinished migrations pending recovery. + * Note: This method assumes its caller is preventing new migrations from starting. + */ +void drainMigrationsPendingRecovery(OperationContext* opCtx); + } // namespace migrationutil } // namespace mongo |