diff options
author | Jack Mulrow <jack.mulrow@mongodb.com> | 2021-03-15 16:10:13 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-05-17 15:31:29 +0000 |
commit | 6ad2f864751c61924551b7f50a9eb9e9d654c020 (patch) | |
tree | 34d646c3dbf8886194b6b880d57678e65aadf3f8 /src | |
parent | 97bd3bc33d6bee92c5a780b525113b700e3b629a (diff) | |
download | mongo-6ad2f864751c61924551b7f50a9eb9e9d654c020.tar.gz |
SERVER-53407 Ensure tenant migrations provide causal consistency with a lagged donor
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/repl/tenant_migration_recipient_service.cpp | 42 | ||||
-rw-r--r-- | src/mongo/db/repl/tenant_migration_recipient_service.h | 3 |
2 files changed, 28 insertions, 17 deletions
diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.cpp b/src/mongo/db/repl/tenant_migration_recipient_service.cpp index 2925b683c69..356284f323d 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_service.cpp @@ -54,6 +54,7 @@ #include "mongo/db/repl/repl_client_info.h" #include "mongo/db/repl/repl_server_parameters_gen.h" #include "mongo/db/repl/replication_auth.h" +#include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/repl/tenant_migration_access_blocker_util.h" #include "mongo/db/repl/tenant_migration_decoration.h" #include "mongo/db/repl/tenant_migration_recipient_entry_helpers.h" @@ -419,6 +420,29 @@ OpTime TenantMigrationRecipientService::Instance::waitUntilMigrationReachesConsi return _dataConsistentPromise.getFuture().get(opCtx); } +Timestamp selectRejectReadsBeforeTimestamp(OperationContext* opCtx, + const Timestamp& returnAfterReachingTimestamp, + const OpTime& oplogApplierOpTime) { + // Don't allow reading before the opTime timestamp of the final write on the recipient + // associated with cloning the donor's data so the client can't see an inconsistent state. The + // oplog applier timestamp may be null if no oplog entries were copied, but data may still have + // been cloned, so use the last applied opTime in that case. + // + // Note the cloning writes happen on a separate thread, but the last applied opTime in the + // replication coordinator is guaranteed to be inclusive of those writes because this function + // is called after waiting for the _dataConsistentPromise to resolve, which happens after the + // last write for cloning completes (and all of its WUOW onCommit() handlers). + auto finalRecipientWriteTimestamp = oplogApplierOpTime.getTimestamp().isNull() + ? ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime().getTimestamp() + : oplogApplierOpTime.getTimestamp(); + + // Also don't allow reading before the returnAfterReachingTimestamp (aka the blockTimestamp) to + // prevent readers from possibly seeing data in a point in time snapshot on the recipient that + // would not have been seen at the same point in time on the donor if the donor's cluster time + // is ahead of the recipient's. + return std::max(finalRecipientWriteTimestamp, returnAfterReachingTimestamp); +} + OpTime TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterReachingTimestamp( OperationContext* opCtx, const Timestamp& returnAfterReachingTimestamp) { @@ -427,18 +451,6 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR // waits on _dataConsistentPromise. _dataConsistentPromise.getFuture().get(opCtx); - { - stdx::lock_guard lk(_mutex); - if (_stateDoc.getRejectReadsBeforeTimestamp()) { - uassert( - ErrorCodes::IllegalOperation, - str::stream() << "Received a conflicting returnAfterReachingTimestamp, received: " - << returnAfterReachingTimestamp.toBSON() << " expected: " - << _stateDoc.getRejectReadsBeforeTimestamp()->toBSON(), - returnAfterReachingTimestamp == *_stateDoc.getRejectReadsBeforeTimestamp()); - } - } - auto getWaitOpTimeFuture = [&]() { stdx::unique_lock lk(_mutex); // In the event of a donor failover, it is possible that a new donor has stepped up and @@ -487,6 +499,7 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR } uassertStatusOK(status); + auto& donorRecipientOpTimePair = swDonorRecipientOpTimePair.getValue(); // Make sure that the recipient logical clock has advanced to at least the donor timestamp // before returning success for recipientSyncData. @@ -495,7 +508,8 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR { stdx::lock_guard lk(_mutex); - _stateDoc.setRejectReadsBeforeTimestamp(returnAfterReachingTimestamp); + _stateDoc.setRejectReadsBeforeTimestamp(selectRejectReadsBeforeTimestamp( + opCtx, returnAfterReachingTimestamp, donorRecipientOpTimePair.recipientOpTime)); } _stopOrHangOnFailPoint(&fpBeforePersistingRejectReadsBeforeTimestamp, opCtx); uassertStatusOK(tenantMigrationRecipientEntryHelpers::updateStateDoc(opCtx, _stateDoc)); @@ -509,7 +523,7 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR _stopOrHangOnFailPoint(&fpAfterWaitForRejectReadsBeforeTimestamp, opCtx); - return swDonorRecipientOpTimePair.getValue().donorOpTime; + return donorRecipientOpTimePair.donorOpTime; } std::unique_ptr<DBClientConnection> TenantMigrationRecipientService::Instance::_connectAndAuth( diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.h b/src/mongo/db/repl/tenant_migration_recipient_service.h index 62502d29dfe..5c9f375b59f 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service.h +++ b/src/mongo/db/repl/tenant_migration_recipient_service.h @@ -161,9 +161,6 @@ public: * document and waits for the write to be replicated to every node (i.e. wait for * 'rejectReadsBeforeTimestamp' to be set on the TenantMigrationRecipientAccessBlocker of * every node) to guarantee that no reads will be incorrectly accepted. - * - * Throws IllegalOperation if the state document already has 'rejectReadsBeforeTimestamp' - * that is not equal to 'returnAfterReachingTimestamp', and on other error. */ OpTime waitUntilMigrationReachesReturnAfterReachingTimestamp( OperationContext* opCtx, const Timestamp& returnAfterReachingTimestamp); |