diff options
author | Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> | 2021-12-30 11:05:11 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-30 15:03:57 +0000 |
commit | d5618f96bdd4630736d8441b38a7287adff8aed7 (patch) | |
tree | d03935d8985251dc5c31dc609910fb30f2a757ce | |
parent | 1e5c94ab5444a6301974c5a2c4dda9b8ae325168 (diff) | |
download | mongo-d5618f96bdd4630736d8441b38a7287adff8aed7.tar.gz |
SERVER-62296 MoveChunk should recover any unfinished migration before starting a new one
(cherry picked from commit dd35d0eae5c81db28eb618ae0ae588e32a4a617a)
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 4 | ||||
-rw-r--r-- | jstests/sharding/migration_recovers_unfinished_migrations.js | 81 | ||||
-rw-r--r-- | src/mongo/db/s/migration_source_manager.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/s/migration_util.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/s/migration_util.h | 6 |
5 files changed, 121 insertions, 0 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 2639986bf27..5211090cfec 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -82,6 +82,8 @@ last-continuous: test_file: jstests/auth/dbcheck.js - ticket: SERVER-56127 test_file: jstests/sharding/retryable_writes_nested_shard_key.js + - ticket: SERVER-62296 + test_file: jstests/sharding/migration_recovers_unfinished_migrations.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: @@ -320,6 +322,8 @@ last-lts: test_file: jstests/sharding/retryable_writes_nested_shard_key.js - ticket: SERVER-62212 test_file: jstests/replsets/dbcheck_write_concern.js + - ticket: SERVER-62296 + test_file: jstests/sharding/migration_recovers_unfinished_migrations.js # Tests that should only be excluded from particular suites should be listed under that suite. diff --git a/jstests/sharding/migration_recovers_unfinished_migrations.js b/jstests/sharding/migration_recovers_unfinished_migrations.js new file mode 100644 index 00000000000..15e22ee0005 --- /dev/null +++ b/jstests/sharding/migration_recovers_unfinished_migrations.js @@ -0,0 +1,81 @@ +/** + * Tests that while there is an unfinished migration pending recovery, if a new migration (of a + * different collection) attempts to start, it will first need to recover the unfinished migration. + */ +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load('jstests/libs/chunk_manipulation_util.js'); + +// Disable checking for index consistency to ensure that the config server doesn't trigger a +// StaleShardVersion exception on the shards and cause them to refresh their sharding metadata. That +// would interfere with the precise migration recovery interleaving this test requires. 
+const nodeOptions = { + setParameter: {enableShardedIndexConsistencyCheck: false} +}; + +// Disable balancer in order to prevent balancing rounds from triggering shard version refreshes on +// the shards that would interfere with the migration recovery interleaving this test requires. +var st = new ShardingTest({shards: 2, other: {configOptions: nodeOptions, enableBalancer: false}}); +let staticMongod = MongoRunner.runMongod({}); + +const dbName = "test"; +const collNameA = "foo"; +const collNameB = "bar"; +const nsA = dbName + "." + collNameA; +const nsB = dbName + "." + collNameB; + +assert.commandWorked( + st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName})); +assert.commandWorked(st.s.adminCommand({shardCollection: nsA, key: {_id: 1}})); +assert.commandWorked(st.s.adminCommand({shardCollection: nsB, key: {_id: 1}})); + +// Hang the first migration (of nsA) before it commits, so that the two failpoints below can be enabled before it resumes. +let moveChunkHangAtStep5Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep5"); +var joinMoveChunk1 = moveChunkParallel( + staticMongod, st.s0.host, {_id: 0}, null, nsA, st.shard1.shardName, true /* expectSuccess */); + +moveChunkHangAtStep5Failpoint.wait(); + +let migrationCommitNetworkErrorFailpoint = + configureFailPoint(st.rs0.getPrimary(), "migrationCommitNetworkError"); +let skipShardFilteringMetadataRefreshFailpoint = + configureFailPoint(st.rs0.getPrimary(), "skipShardFilteringMetadataRefresh"); + +moveChunkHangAtStep5Failpoint.off(); +joinMoveChunk1(); +migrationCommitNetworkErrorFailpoint.off(); +skipShardFilteringMetadataRefreshFailpoint.off(); + +// The first migration (of nsA) is left pending recovery: its config.migrationCoordinators document still exists. +{ + let migrationCoordinatorDocuments = + st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray(); + assert.eq(1, migrationCoordinatorDocuments.length); + assert.eq(nsA, migrationCoordinatorDocuments[0].nss); +} + +// Start a second migration on a different collection and wait until it persists its recovery +// document. 
+let moveChunkHangAtStep3Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep3"); + +var joinMoveChunk2 = moveChunkParallel( + staticMongod, st.s0.host, {_id: 0}, null, nsB, st.shard1.shardName, true /* expectSuccess */); +moveChunkHangAtStep3Failpoint.wait(); + +// Check that the first migration has been recovered. There must be only one +// config.migrationCoordinators document, which corresponds to the second migration. +{ + let migrationCoordinatorDocuments = + st.rs0.getPrimary().getDB('config')['migrationCoordinators'].find().toArray(); + assert.eq(1, migrationCoordinatorDocuments.length); + assert.eq(nsB, migrationCoordinatorDocuments[0].nss); +} + +moveChunkHangAtStep3Failpoint.off(); +joinMoveChunk2(); + +MongoRunner.stopMongod(staticMongod); +st.stop(); +})(); diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index 7c986e299fc..c04052c7cad 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -144,6 +144,18 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, // command. onShardVersionMismatch(_opCtx, getNss(), boost::none); + // Complete any unfinished migration pending recovery (of any collection) before starting this one + { + migrationutil::drainMigrationsPendingRecovery(opCtx); + + // Since the moveChunk command is holding the ActiveMigrationRegistry and we just drained + // all migrations pending recovery, now there cannot be any document in + // config.migrationCoordinators. 
+ PersistentTaskStore<MigrationCoordinatorDocument> store( + NamespaceString::kMigrationCoordinatorsNamespace); + invariant(store.count(opCtx) == 0); + } + // Snapshot the committed metadata from the time the migration starts const auto [collectionMetadata, collectionUUID] = [&] { UninterruptibleLockGuard noInterrupt(_opCtx->lockState()); diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp index 1e8d23c6941..028c784ebf7 100644 --- a/src/mongo/db/s/migration_util.cpp +++ b/src/mongo/db/s/migration_util.cpp @@ -1045,5 +1045,23 @@ void recoverMigrationCoordinations(OperationContext* opCtx, NamespaceString nss) }); } +void drainMigrationsPendingRecovery(OperationContext* opCtx) { + PersistentTaskStore<MigrationCoordinatorDocument> store( + NamespaceString::kMigrationCoordinatorsNamespace); + + while (store.count(opCtx)) { // Repeat full passes until no migration coordinator document remains + store.forEach(opCtx, BSONObj(), [opCtx](const MigrationCoordinatorDocument& doc) { + try { + onShardVersionMismatch(opCtx, doc.getNss(), boost::none); + } catch (DBException& ex) { + ex.addContext(str::stream() << "Failed to recover pending migration for document " + << doc.toBSON()); + throw; + } + return true; + }); + } +} + } // namespace migrationutil } // namespace mongo diff --git a/src/mongo/db/s/migration_util.h b/src/mongo/db/s/migration_util.h index d779fd3779e..e5da12dcfd4 100644 --- a/src/mongo/db/s/migration_util.h +++ b/src/mongo/db/s/migration_util.h @@ -225,5 +225,11 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx); */ void recoverMigrationCoordinations(OperationContext* opCtx, NamespaceString nss); +/** + * Recovers all unfinished migrations pending recovery, looping until no migration coordinator document remains. A DBException thrown while recovering a document is rethrown with that document added as context. + * Note: This method assumes its caller is preventing new migrations from starting. + */ +void drainMigrationsPendingRecovery(OperationContext* opCtx); + } // namespace migrationutil } // namespace mongo |