summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Jack Mulrow <jack.mulrow@mongodb.com> 2021-03-15 16:10:13 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-05-17 15:31:29 +0000
commit 6ad2f864751c61924551b7f50a9eb9e9d654c020 (patch)
tree 34d646c3dbf8886194b6b880d57678e65aadf3f8 /src
parent 97bd3bc33d6bee92c5a780b525113b700e3b629a (diff)
download mongo-6ad2f864751c61924551b7f50a9eb9e9d654c020.tar.gz
SERVER-53407 Ensure tenant migrations provide causal consistency with a lagged donor
Diffstat (limited to 'src')
-rw-r--r-- src/mongo/db/repl/tenant_migration_recipient_service.cpp 42
-rw-r--r-- src/mongo/db/repl/tenant_migration_recipient_service.h 3
2 files changed, 28 insertions, 17 deletions
diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.cpp b/src/mongo/db/repl/tenant_migration_recipient_service.cpp
index 2925b683c69..356284f323d 100644
--- a/src/mongo/db/repl/tenant_migration_recipient_service.cpp
+++ b/src/mongo/db/repl/tenant_migration_recipient_service.cpp
@@ -54,6 +54,7 @@
#include "mongo/db/repl/repl_client_info.h"
#include "mongo/db/repl/repl_server_parameters_gen.h"
#include "mongo/db/repl/replication_auth.h"
+#include "mongo/db/repl/replication_coordinator.h"
#include "mongo/db/repl/tenant_migration_access_blocker_util.h"
#include "mongo/db/repl/tenant_migration_decoration.h"
#include "mongo/db/repl/tenant_migration_recipient_entry_helpers.h"
@@ -419,6 +420,29 @@ OpTime TenantMigrationRecipientService::Instance::waitUntilMigrationReachesConsi
return _dataConsistentPromise.getFuture().get(opCtx);
}
+Timestamp selectRejectReadsBeforeTimestamp(OperationContext* opCtx,
+ const Timestamp& returnAfterReachingTimestamp,
+ const OpTime& oplogApplierOpTime) {
+ // Returns the later of two lower bounds on the timestamps at which reads may be served.
+ //
+ // First bound: the timestamp of the final recipient write made while cloning the donor's
+ // data, so a client can't see an inconsistent state. 'oplogApplierOpTime' may be null if no
+ // oplog entries were copied even though data was cloned; in that case fall back to this
+ // node's last applied opTime. Cloning writes happen on a separate thread, but the last
+ // applied opTime is guaranteed to include them because this function is called after
+ // _dataConsistentPromise resolves, which happens after the last cloning write (and all of
+ // its WUOW onCommit() handlers) completes.
+ auto finalRecipientWriteTimestamp = oplogApplierOpTime.getTimestamp().isNull()
+ ? ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime().getTimestamp()
+ : oplogApplierOpTime.getTimestamp();
+
+ // Second bound: 'returnAfterReachingTimestamp' (aka the blockTimestamp). If the donor's
+ // cluster time is ahead of the recipient's, a point-in-time snapshot read on the recipient
+ // before this timestamp could show data that would not have been visible at the same point
+ // in time on the donor.
+ return std::max(finalRecipientWriteTimestamp, returnAfterReachingTimestamp);
+}
+
OpTime
TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterReachingTimestamp(
OperationContext* opCtx, const Timestamp& returnAfterReachingTimestamp) {
@@ -427,18 +451,6 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR
// waits on _dataConsistentPromise.
_dataConsistentPromise.getFuture().get(opCtx);
- {
- stdx::lock_guard lk(_mutex);
- if (_stateDoc.getRejectReadsBeforeTimestamp()) {
- uassert(
- ErrorCodes::IllegalOperation,
- str::stream() << "Received a conflicting returnAfterReachingTimestamp, received: "
- << returnAfterReachingTimestamp.toBSON() << " expected: "
- << _stateDoc.getRejectReadsBeforeTimestamp()->toBSON(),
- returnAfterReachingTimestamp == *_stateDoc.getRejectReadsBeforeTimestamp());
- }
- }
-
auto getWaitOpTimeFuture = [&]() {
stdx::unique_lock lk(_mutex);
// In the event of a donor failover, it is possible that a new donor has stepped up and
@@ -487,6 +499,7 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR
}
uassertStatusOK(status);
+ auto& donorRecipientOpTimePair = swDonorRecipientOpTimePair.getValue();
// Make sure that the recipient logical clock has advanced to at least the donor timestamp
// before returning success for recipientSyncData.
@@ -495,7 +508,8 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR
{
stdx::lock_guard lk(_mutex);
- _stateDoc.setRejectReadsBeforeTimestamp(returnAfterReachingTimestamp);
+ _stateDoc.setRejectReadsBeforeTimestamp(selectRejectReadsBeforeTimestamp(
+ opCtx, returnAfterReachingTimestamp, donorRecipientOpTimePair.recipientOpTime));
}
_stopOrHangOnFailPoint(&fpBeforePersistingRejectReadsBeforeTimestamp, opCtx);
uassertStatusOK(tenantMigrationRecipientEntryHelpers::updateStateDoc(opCtx, _stateDoc));
@@ -509,7 +523,7 @@ TenantMigrationRecipientService::Instance::waitUntilMigrationReachesReturnAfterR
_stopOrHangOnFailPoint(&fpAfterWaitForRejectReadsBeforeTimestamp, opCtx);
- return swDonorRecipientOpTimePair.getValue().donorOpTime;
+ return donorRecipientOpTimePair.donorOpTime;
}
std::unique_ptr<DBClientConnection> TenantMigrationRecipientService::Instance::_connectAndAuth(
diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.h b/src/mongo/db/repl/tenant_migration_recipient_service.h
index 62502d29dfe..5c9f375b59f 100644
--- a/src/mongo/db/repl/tenant_migration_recipient_service.h
+++ b/src/mongo/db/repl/tenant_migration_recipient_service.h
@@ -161,9 +161,6 @@ public:
* document and waits for the write to be replicated to every node (i.e. wait for
* 'rejectReadsBeforeTimestamp' to be set on the TenantMigrationRecipientAccessBlocker of
* every node) to guarantee that no reads will be incorrectly accepted.
- *
- * Throws IllegalOperation if the state document already has 'rejectReadsBeforeTimestamp'
- * that is not equal to 'returnAfterReachingTimestamp', and on other error.
*/
OpTime waitUntilMigrationReachesReturnAfterReachingTimestamp(
OperationContext* opCtx, const Timestamp& returnAfterReachingTimestamp);