diff options
author | Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> | 2021-12-30 12:18:25 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-30 12:42:27 +0000 |
commit | 8e6ab9a259d921298940190161fadfd118c6dc15 (patch) | |
tree | 12d797530f670dc101c46c2f73d23a50a52e8f5b /src/mongo/db | |
parent | dd35d0eae5c81db28eb618ae0ae588e32a4a617a (diff) | |
download | mongo-8e6ab9a259d921298940190161fadfd118c6dc15.tar.gz |
SERVER-62245 MigrationRecovery must not assume that only one migration needs to be recovered
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/s/migration_coordinator.cpp | 41 | ||||
-rw-r--r-- | src/mongo/db/s/migration_coordinator.h | 2 | ||||
-rw-r--r-- | src/mongo/db/s/migration_util.cpp | 25 |
3 files changed, 44 insertions, 24 deletions
diff --git a/src/mongo/db/s/migration_coordinator.cpp b/src/mongo/db/s/migration_coordinator.cpp index 5b983056cf3..dfab697554e 100644 --- a/src/mongo/db/s/migration_coordinator.cpp +++ b/src/mongo/db/s/migration_coordinator.cpp @@ -90,7 +90,7 @@ MigrationCoordinator::MigrationCoordinator(MigrationSessionId sessionId, _waitForDelete(waitForDelete) {} MigrationCoordinator::MigrationCoordinator(const MigrationCoordinatorDocument& doc) - : _migrationInfo(doc) {} + : _migrationInfo(doc), _recoveringMigration(true) {} MigrationCoordinator::~MigrationCoordinator() = default; @@ -208,10 +208,26 @@ SemiFuture<void> MigrationCoordinator::_commitMigrationOnDonorAndRecipient( "lsid"_attr = _migrationInfo.getLsid(), "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(), "migrationId"_attr = _migrationInfo.getId()); - migrationutil::advanceTransactionOnRecipient(opCtx, - _migrationInfo.getRecipientShardId(), - _migrationInfo.getLsid(), - _migrationInfo.getTxnNumber()); + try { + migrationutil::advanceTransactionOnRecipient(opCtx, + _migrationInfo.getRecipientShardId(), + _migrationInfo.getLsid(), + _migrationInfo.getTxnNumber()); + } catch (const ExceptionFor<ErrorCodes::TransactionTooOld>& ex) { + // TODO: SERVER-62316: No longer catch after 6.0 branches out + if (_recoveringMigration) { + LOGV2_WARNING(6224500, + "Transaction number on recipient shard was already advanced by a later " + "migration that started before this one finished recovery", + "namespace"_attr = _migrationInfo.getNss(), + "migrationId"_attr = _migrationInfo.getId(), + "lsid"_attr = _migrationInfo.getLsid(), + "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(), + "error"_attr = redact(ex)); + } else { + throw; + } + } hangBeforeSendingCommitDecision.pauseWhileSet(); @@ -293,8 +309,21 @@ void MigrationCoordinator::_abortMigrationOnDonorAndRecipient(OperationContext* "recipientShardId"_attr = _migrationInfo.getRecipientShardId(), "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(), "error"_attr = exShardNotFound); + } catch (const ExceptionFor<ErrorCodes::TransactionTooOld>& ex) { + // TODO: SERVER-62316: No longer catch after 6.0 branches out + if (_recoveringMigration) { + LOGV2_WARNING(6224501, + "Transaction number on recipient shard was already advanced by a later " + "migration that started before this one finished recovery", + "namespace"_attr = _migrationInfo.getNss(), + "migrationId"_attr = _migrationInfo.getId(), + "lsid"_attr = _migrationInfo.getLsid(), + "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(), + "error"_attr = redact(ex)); + } else { + throw; + } } - LOGV2_DEBUG(23902, 2, "Marking range deletion task on recipient as ready for processing", diff --git a/src/mongo/db/s/migration_coordinator.h b/src/mongo/db/s/migration_coordinator.h index 0c96d1eab15..88db1a68008 100644 --- a/src/mongo/db/s/migration_coordinator.h +++ b/src/mongo/db/s/migration_coordinator.h @@ -125,6 +125,8 @@ private: MigrationCoordinatorDocument _migrationInfo; bool _waitForDelete = false; boost::optional<ExecutorFuture<void>> _releaseRecipientCriticalSectionFuture; + const bool _recoveringMigration = + false; // TODO: SERVER-62316: Can be removed after 6.0 branches out }; } // namespace migrationutil diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp index 61fe9f728eb..fa60f7990a8 100644 --- a/src/mongo/db/s/migration_util.cpp +++ b/src/mongo/db/s/migration_util.cpp @@ -816,7 +816,12 @@ void markAsReadyRangeDeletionTaskLocally(OperationContext* opCtx, const UUID& mi auto update = BSON("$unset" << BSON(RangeDeletionTask::kPendingFieldName << "")); hangInReadyRangeDeletionLocallyInterruptible.pauseWhileSet(opCtx); - store.update(opCtx, query, update); + try { + store.update(opCtx, query, update); + } catch (const ExceptionFor<ErrorCodes::NoMatchingDocument>&) { + // If we are recovering the migration, the range-deletion may have already finished. So its + // associated document may already have been removed. + } if (hangInReadyRangeDeletionLocallyThenSimulateErrorUninterruptible.shouldFail()) { hangInReadyRangeDeletionLocallyThenSimulateErrorUninterruptible.pauseWhileSet(opCtx); @@ -884,16 +889,6 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) { store.forEach(opCtx, BSONObj{}, [&opCtx, &unfinishedMigrationsCount](const MigrationCoordinatorDocument& doc) { - // MigrationCoordinators are only created under the MigrationBlockingGuard, - // which means that only one can possibly exist on an instance at a time. - // Furthermore, recovery of an incomplete MigrationCoordator also acquires the - // MigrationBlockingGuard. Because of this it is not possible to have more - // than one unfinished migration. - invariant(unfinishedMigrationsCount == 0, - str::stream() - << "Upon step-up a second migration coordinator was found" - << redact(doc.toBSON())); - unfinishedMigrationsCount++; LOGV2_DEBUG(4798511, 3, @@ -908,14 +903,8 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) { CollectionShardingRuntime::get(opCtx, nss)->clearFilteringMetadata(opCtx); } - auto mbg = std::make_shared<MigrationBlockingGuard>( - opCtx, - str::stream() << "Recovery of migration session " - << doc.getMigrationSessionId().toString() - << " on collection " << nss); - ExecutorFuture<void>(getMigrationUtilExecutor(opCtx->getServiceContext())) - .then([serviceContext = opCtx->getServiceContext(), nss, mbg] { + .then([serviceContext = opCtx->getServiceContext(), nss] { ThreadClient tc("TriggerMigrationRecovery", serviceContext); { stdx::lock_guard<Client> lk(*tc.get()); |