summary refs log tree commit diff
path: root/src/mongo/db
diff options
context:
space:
mode:
author    Jordi Serra Torrens <jordi.serra-torrens@mongodb.com>  2021-12-30 12:18:25 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com>       2021-12-30 12:42:27 +0000
commit    8e6ab9a259d921298940190161fadfd118c6dc15 (patch)
tree      12d797530f670dc101c46c2f73d23a50a52e8f5b /src/mongo/db
parent    dd35d0eae5c81db28eb618ae0ae588e32a4a617a (diff)
download  mongo-8e6ab9a259d921298940190161fadfd118c6dc15.tar.gz
SERVER-62245 MigrationRecovery must not assume that only one migration needs to be recovered
Diffstat (limited to 'src/mongo/db')
-rw-r--r--  src/mongo/db/s/migration_coordinator.cpp  41
-rw-r--r--  src/mongo/db/s/migration_coordinator.h     2
-rw-r--r--  src/mongo/db/s/migration_util.cpp          25
3 files changed, 44 insertions, 24 deletions
diff --git a/src/mongo/db/s/migration_coordinator.cpp b/src/mongo/db/s/migration_coordinator.cpp
index 5b983056cf3..dfab697554e 100644
--- a/src/mongo/db/s/migration_coordinator.cpp
+++ b/src/mongo/db/s/migration_coordinator.cpp
@@ -90,7 +90,7 @@ MigrationCoordinator::MigrationCoordinator(MigrationSessionId sessionId,
_waitForDelete(waitForDelete) {}
MigrationCoordinator::MigrationCoordinator(const MigrationCoordinatorDocument& doc)
- : _migrationInfo(doc) {}
+ : _migrationInfo(doc), _recoveringMigration(true) {}
MigrationCoordinator::~MigrationCoordinator() = default;
@@ -208,10 +208,26 @@ SemiFuture<void> MigrationCoordinator::_commitMigrationOnDonorAndRecipient(
"lsid"_attr = _migrationInfo.getLsid(),
"currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
"migrationId"_attr = _migrationInfo.getId());
- migrationutil::advanceTransactionOnRecipient(opCtx,
- _migrationInfo.getRecipientShardId(),
- _migrationInfo.getLsid(),
- _migrationInfo.getTxnNumber());
+ try {
+ migrationutil::advanceTransactionOnRecipient(opCtx,
+ _migrationInfo.getRecipientShardId(),
+ _migrationInfo.getLsid(),
+ _migrationInfo.getTxnNumber());
+ } catch (const ExceptionFor<ErrorCodes::TransactionTooOld>& ex) {
+ // TODO: SERVER-62316: No longer catch after 6.0 branches out
+ if (_recoveringMigration) {
+ LOGV2_WARNING(6224500,
+ "Transaction number on recipient shard was already advanced by a later "
+ "migration that started before this one finished recovery",
+ "namespace"_attr = _migrationInfo.getNss(),
+ "migrationId"_attr = _migrationInfo.getId(),
+ "lsid"_attr = _migrationInfo.getLsid(),
+ "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
+ "error"_attr = redact(ex));
+ } else {
+ throw;
+ }
+ }
hangBeforeSendingCommitDecision.pauseWhileSet();
@@ -293,8 +309,21 @@ void MigrationCoordinator::_abortMigrationOnDonorAndRecipient(OperationContext*
"recipientShardId"_attr = _migrationInfo.getRecipientShardId(),
"currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
"error"_attr = exShardNotFound);
+ } catch (const ExceptionFor<ErrorCodes::TransactionTooOld>& ex) {
+ // TODO: SERVER-62316: No longer catch after 6.0 branches out
+ if (_recoveringMigration) {
+ LOGV2_WARNING(6224501,
+ "Transaction number on recipient shard was already advanced by a later "
+ "migration that started before this one finished recovery",
+ "namespace"_attr = _migrationInfo.getNss(),
+ "migrationId"_attr = _migrationInfo.getId(),
+ "lsid"_attr = _migrationInfo.getLsid(),
+ "currentTxnNumber"_attr = _migrationInfo.getTxnNumber(),
+ "error"_attr = redact(ex));
+ } else {
+ throw;
+ }
}
-
LOGV2_DEBUG(23902,
2,
"Marking range deletion task on recipient as ready for processing",
diff --git a/src/mongo/db/s/migration_coordinator.h b/src/mongo/db/s/migration_coordinator.h
index 0c96d1eab15..88db1a68008 100644
--- a/src/mongo/db/s/migration_coordinator.h
+++ b/src/mongo/db/s/migration_coordinator.h
@@ -125,6 +125,8 @@ private:
MigrationCoordinatorDocument _migrationInfo;
bool _waitForDelete = false;
boost::optional<ExecutorFuture<void>> _releaseRecipientCriticalSectionFuture;
+ const bool _recoveringMigration =
+ false; // TODO: SERVER-62316: Can be removed after 6.0 branches out
};
} // namespace migrationutil
diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp
index 61fe9f728eb..fa60f7990a8 100644
--- a/src/mongo/db/s/migration_util.cpp
+++ b/src/mongo/db/s/migration_util.cpp
@@ -816,7 +816,12 @@ void markAsReadyRangeDeletionTaskLocally(OperationContext* opCtx, const UUID& mi
auto update = BSON("$unset" << BSON(RangeDeletionTask::kPendingFieldName << ""));
hangInReadyRangeDeletionLocallyInterruptible.pauseWhileSet(opCtx);
- store.update(opCtx, query, update);
+ try {
+ store.update(opCtx, query, update);
+ } catch (const ExceptionFor<ErrorCodes::NoMatchingDocument>&) {
+ // If we are recovering the migration, the range-deletion may have already finished. So its
+ // associated document may already have been removed.
+ }
if (hangInReadyRangeDeletionLocallyThenSimulateErrorUninterruptible.shouldFail()) {
hangInReadyRangeDeletionLocallyThenSimulateErrorUninterruptible.pauseWhileSet(opCtx);
@@ -884,16 +889,6 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) {
store.forEach(opCtx,
BSONObj{},
[&opCtx, &unfinishedMigrationsCount](const MigrationCoordinatorDocument& doc) {
- // MigrationCoordinators are only created under the MigrationBlockingGuard,
- // which means that only one can possibly exist on an instance at a time.
- // Furthermore, recovery of an incomplete MigrationCoordator also acquires the
- // MigrationBlockingGuard. Because of this it is not possible to have more
- // than one unfinished migration.
- invariant(unfinishedMigrationsCount == 0,
- str::stream()
- << "Upon step-up a second migration coordinator was found"
- << redact(doc.toBSON()));
-
unfinishedMigrationsCount++;
LOGV2_DEBUG(4798511,
3,
@@ -908,14 +903,8 @@ void resumeMigrationCoordinationsOnStepUp(OperationContext* opCtx) {
CollectionShardingRuntime::get(opCtx, nss)->clearFilteringMetadata(opCtx);
}
- auto mbg = std::make_shared<MigrationBlockingGuard>(
- opCtx,
- str::stream() << "Recovery of migration session "
- << doc.getMigrationSessionId().toString()
- << " on collection " << nss);
-
ExecutorFuture<void>(getMigrationUtilExecutor(opCtx->getServiceContext()))
- .then([serviceContext = opCtx->getServiceContext(), nss, mbg] {
+ .then([serviceContext = opCtx->getServiceContext(), nss] {
ThreadClient tc("TriggerMigrationRecovery", serviceContext);
{
stdx::lock_guard<Client> lk(*tc.get());