diff options
author | Cheahuychou Mao <mao.cheahuychou@gmail.com> | 2021-02-09 21:14:45 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-02-10 23:05:01 +0000 |
commit | 913c4be3ed219d83f19e0311c312fda5b3ca186e (patch) | |
tree | b4a92f5bfa8c1ded407c69d45886703817bb74a4 /src | |
parent | 36c670a00152ed2af43f1cf1e7ac9bebb6d0e2f0 (diff) | |
download | mongo-913c4be3ed219d83f19e0311c312fda5b3ca186e.tar.gz |
SERVER-54425 Make TenantMigrationDonorAccessBlocker keep track of number of blocked reads and writes and tenant migration errors thrown
Diffstat (limited to 'src')
11 files changed, 99 insertions, 58 deletions
diff --git a/src/mongo/db/commands/write_commands/write_commands.cpp b/src/mongo/db/commands/write_commands/write_commands.cpp index ff84f0897e1..33f4f719367 100644 --- a/src/mongo/db/commands/write_commands/write_commands.cpp +++ b/src/mongo/db/commands/write_commands/write_commands.cpp @@ -307,6 +307,7 @@ boost::optional<BSONObj> generateError(OperationContext* opCtx, auto migrationStatus = mtab->waitUntilCommittedOrAborted(opCtx, migrationConflictInfo->getOperationType()); + mtab->recordTenantMigrationError(migrationStatus); error.append("code", static_cast<int>(migrationStatus.code())); // We want to append an empty errmsg for the errors after the first one, so let the diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp index a419e3150d9..3b31c7f1b17 100644 --- a/src/mongo/db/op_observer_impl.cpp +++ b/src/mongo/db/op_observer_impl.cpp @@ -1232,7 +1232,8 @@ void OpObserverImpl::onUnpreparedTransactionCommit(OperationContext* opCtx, // Throw TenantMigrationConflict error if the database for the transaction statements is being // migrated. We only need check the namespace of the first statement since a transaction's // statements must all be for the same tenant. - tenant_migration_access_blocker::onWriteToDatabase(opCtx, statements->begin()->getNss().db()); + tenant_migration_access_blocker::checkIfCanWriteOrThrow(opCtx, + statements->begin()->getNss().db()); if (MONGO_unlikely(hangAndFailUnpreparedCommitAfterReservingOplogSlot.shouldFail())) { hangAndFailUnpreparedCommitAfterReservingOplogSlot.pauseWhileSet(opCtx); diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 251153830c5..dba72ba9275 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -296,7 +296,7 @@ void _logOpsInner(OperationContext* opCtx, }); if (!isAbortIndexBuild) { - tenant_migration_access_blocker::onWriteToDatabase(opCtx, nss.db()); + tenant_migration_access_blocker::checkIfCanWriteOrThrow(opCtx, nss.db()); } else if (records->size() > 1) { str::stream ss; ss << "abortIndexBuild cannot be logged with other oplog entries "; diff --git a/src/mongo/db/repl/tenant_migration_access_blocker.h b/src/mongo/db/repl/tenant_migration_access_blocker.h index e88cb155d6d..e14c0d2f456 100644 --- a/src/mongo/db/repl/tenant_migration_access_blocker.h +++ b/src/mongo/db/repl/tenant_migration_access_blocker.h @@ -55,11 +55,11 @@ public: // Called by all writes and reads against the database. // - virtual void checkIfCanWriteOrThrow() = 0; + virtual Status checkIfCanWrite() = 0; virtual Status waitUntilCommittedOrAborted(OperationContext* opCtx, OperationType operationType) = 0; - virtual void checkIfLinearizableReadWasAllowedOrThrow(OperationContext* opCtx) = 0; + virtual Status checkIfLinearizableReadWasAllowed(OperationContext* opCtx) = 0; virtual SharedSemiFuture<void> getCanReadFuture(OperationContext* opCtx) = 0; // @@ -79,8 +79,16 @@ public: virtual void appendInfoForServerStatus(BSONObjBuilder* builder) const = 0; - // Returns structured info with current tenant ID and connection string. + /** + * Returns structured info with tenant id and connection string. + */ virtual BSONObj getDebugInfo() const = 0; + + /** + * Updates the runtime statistics for the number of tenant migration errors that have been + * thrown based on the given status. + */ + virtual void recordTenantMigrationError(Status status) = 0; }; } // namespace mongo diff --git a/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp b/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp index fe1e4e2d73f..36573d870d4 100644 --- a/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp +++ b/src/mongo/db/repl/tenant_migration_access_blocker_util.cpp @@ -137,7 +137,9 @@ void checkIfCanReadOrBlock(OperationContext* opCtx, StringData dbName) { // Optimisation: if the future is already ready, we are done. if (futures[0].isReady()) { - futures[0].get(); // Throw if error. + auto status = futures[0].getNoThrow(); + mtab->recordTenantMigrationError(status); + uassertStatusOK(status); return; } @@ -153,6 +155,7 @@ void checkIfCanReadOrBlock(OperationContext* opCtx, StringData dbName) { if (idx == 0) { // Read unblock condition finished first. cancelTimeoutSource.cancel(); + mtab->recordTenantMigrationError(status); uassertStatusOK(status); } else if (idx == 1) { // Deadline finished first, throw error. @@ -167,17 +170,21 @@ void checkIfLinearizableReadWasAllowedOrThrow(OperationContext* opCtx, StringDat repl::ReadConcernLevel::kLinearizableReadConcern) { if (auto mtab = TenantMigrationAccessBlockerRegistry::get(opCtx->getServiceContext()) .getTenantMigrationAccessBlockerForDbName(dbName)) { - mtab->checkIfLinearizableReadWasAllowedOrThrow(opCtx); + auto status = mtab->checkIfLinearizableReadWasAllowed(opCtx); + mtab->recordTenantMigrationError(status); + uassertStatusOK(status); } } } -void onWriteToDatabase(OperationContext* opCtx, StringData dbName) { +void checkIfCanWriteOrThrow(OperationContext* opCtx, StringData dbName) { auto mtab = TenantMigrationAccessBlockerRegistry::get(opCtx->getServiceContext()) .getTenantMigrationAccessBlockerForDbName(dbName); if (mtab) { - mtab->checkIfCanWriteOrThrow(); + auto status = mtab->checkIfCanWrite(); + mtab->recordTenantMigrationError(status); + uassertStatusOK(status); } } @@ -188,6 +195,7 @@ Status checkIfCanBuildIndex(OperationContext* opCtx, StringData dbName) { if (mtab) { // This log is included for synchronization of the tenant migration buildindex jstests. auto status = mtab->checkIfCanBuildIndex(); + mtab->recordTenantMigrationError(status); LOGV2_DEBUG(4886202, 1, "Checked if tenant migration on database prevents index builds", @@ -256,8 +264,10 @@ void handleTenantMigrationConflict(OperationContext* opCtx, Status status) { invariant(migrationConflictInfo); auto mtab = migrationConflictInfo->getTenantMigrationAccessBlocker(); invariant(mtab); - uassertStatusOK( - mtab->waitUntilCommittedOrAborted(opCtx, migrationConflictInfo->getOperationType())); + auto migrationStatus = + mtab->waitUntilCommittedOrAborted(opCtx, migrationConflictInfo->getOperationType()); + mtab->recordTenantMigrationError(migrationStatus); + uassertStatusOK(migrationStatus); } void performNoopWrite(OperationContext* opCtx, StringData msg) { diff --git a/src/mongo/db/repl/tenant_migration_access_blocker_util.h b/src/mongo/db/repl/tenant_migration_access_blocker_util.h index cf338366e5a..a0cf4a57c8d 100644 --- a/src/mongo/db/repl/tenant_migration_access_blocker_util.h +++ b/src/mongo/db/repl/tenant_migration_access_blocker_util.h @@ -74,7 +74,7 @@ void checkIfLinearizableReadWasAllowedOrThrow(OperationContext* opCtx, StringDat * Throws TenantMigrationConflict if the database is being migrated and the migration is in the * blocking state. Throws TenantMigrationCommitted if it is in committed. */ -void onWriteToDatabase(OperationContext* opCtx, StringData dbName); +void checkIfCanWriteOrThrow(OperationContext* opCtx, StringData dbName); /** * Returns TenantMigrationConflict if the database is being migrated (even if migration is not yet diff --git a/src/mongo/db/repl/tenant_migration_donor_access_blocker.cpp b/src/mongo/db/repl/tenant_migration_donor_access_blocker.cpp index 3b1ccf096b4..9789ca823ea 100644 --- a/src/mongo/db/repl/tenant_migration_donor_access_blocker.cpp +++ b/src/mongo/db/repl/tenant_migration_donor_access_blocker.cpp @@ -48,9 +48,6 @@ namespace mongo { namespace { -MONGO_FAIL_POINT_DEFINE(tenantMigrationBlockRead); -MONGO_FAIL_POINT_DEFINE(tenantMigrationBlockWrite); - const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max()); } // namespace @@ -64,21 +61,20 @@ TenantMigrationDonorAccessBlocker::TenantMigrationDonorAccessBlocker( .getOrCreateBlockedOperationsExecutor(); } -void TenantMigrationDonorAccessBlocker::checkIfCanWriteOrThrow() { +Status TenantMigrationDonorAccessBlocker::checkIfCanWrite() { stdx::lock_guard<Latch> lg(_mutex); switch (_state) { case State::kAllow: - return; case State::kAborted: - return; + return Status::OK(); case State::kBlockWrites: case State::kBlockWritesAndReads: - uasserted(TenantMigrationConflictInfo(_tenantId, shared_from_this()), - "Write must block until this tenant migration commits or aborts"); + return {TenantMigrationConflictInfo(_tenantId, shared_from_this()), + "Write must block until this tenant migration commits or aborts"}; case State::kReject: - uasserted(TenantMigrationCommittedInfo(_tenantId, _recipientConnString), - "Write must be re-routed to the new owner of this tenant"); + return {TenantMigrationCommittedInfo(_tenantId, _recipientConnString), + "Write must be re-routed to the new owner of this tenant"}; default: MONGO_UNREACHABLE; } @@ -86,19 +82,6 @@ void TenantMigrationDonorAccessBlocker::checkIfCanWriteOrThrow() { Status TenantMigrationDonorAccessBlocker::waitUntilCommittedOrAborted(OperationContext* opCtx, OperationType operationType) { - { - stdx::unique_lock<Latch> ul(_mutex); - - auto canWrite = [&]() { - return (operationType == kWrite && _state == State::kAllow) || - _state == State::kAborted; - }; - - if (!canWrite()) { - tenantMigrationBlockWrite.shouldFail(); // Return value intentionally ignored. - } - } - // Source to cancel the timeout if the operation completed in time. CancelationSource cancelTimeoutSource; auto executor = getAsyncBlockingOperationsExecutor(); @@ -160,28 +143,29 @@ SharedSemiFuture<void> TenantMigrationDonorAccessBlocker::_getCanDoClusterTimeRe auto canRead = _state == State::kAllow || _state == State::kAborted || _state == State::kBlockWrites || readTimestamp < *_blockTimestamp; - if (!canRead) { - tenantMigrationBlockRead.shouldFail(); // Return value intentionally ignored. - } if (canRead) { return SharedSemiFuture<void>(); } + if (_state == State::kReject) { return SharedSemiFuture<void>( Status(ErrorCodes::TenantMigrationCommitted, - "Write or read must be re-routed to the new owner of this tenant", + "Read must be re-routed to the new owner of this tenant", TenantMigrationCommittedInfo(_tenantId, _recipientConnString).toBSON())); } + _stats.numBlockedReads.addAndFetch(1); return _transitionOutOfBlockingPromise.getFuture(); } -void TenantMigrationDonorAccessBlocker::checkIfLinearizableReadWasAllowedOrThrow( +Status TenantMigrationDonorAccessBlocker::checkIfLinearizableReadWasAllowed( OperationContext* opCtx) { stdx::lock_guard<Latch> lg(_mutex); - uassert(TenantMigrationCommittedInfo(_tenantId, _recipientConnString), - "Read must be re-routed to the new owner of this tenant", - _state != State::kReject); + if (_state == State::kReject) { + return {TenantMigrationCommittedInfo(_tenantId, _recipientConnString), + "Read must be re-routed to the new owner of this tenant"}; + } + return Status::OK(); } Status TenantMigrationDonorAccessBlocker::checkIfCanBuildIndex() { @@ -369,6 +353,7 @@ void TenantMigrationDonorAccessBlocker::appendInfoForServerStatus(BSONObjBuilder if (_abortOpTime) { tenantBuilder.append("abortOpTime", _abortOpTime->toBSON()); } + _stats.report(&tenantBuilder); builder->append(_tenantId, tenantBuilder.obj()); } @@ -393,4 +378,21 @@ BSONObj TenantMigrationDonorAccessBlocker::getDebugInfo() const { return BSON("tenantId" << _tenantId << "recipientConnectionString" << _recipientConnString); } +void TenantMigrationDonorAccessBlocker::recordTenantMigrationError(Status status) { + if (status == ErrorCodes::TenantMigrationConflict) { + _stats.numBlockedWrites.addAndFetch(1); + } else if (status == ErrorCodes::TenantMigrationCommitted) { + _stats.numTenantMigrationCommittedErrors.addAndFetch(1); + } else if (status == ErrorCodes::TenantMigrationAborted) { + _stats.numTenantMigrationAbortedErrors.addAndFetch(1); + } +} + +void TenantMigrationDonorAccessBlocker::Stats::report(BSONObjBuilder* builder) const { + builder->append("numBlockedReads", numBlockedReads.load()); + builder->append("numBlockedWrites", numBlockedWrites.load()); + builder->append("numTenantMigrationCommittedErrors", numTenantMigrationCommittedErrors.load()); + builder->append("numTenantMigrationAbortedErrors", numTenantMigrationAbortedErrors.load()); +} + } // namespace mongo diff --git a/src/mongo/db/repl/tenant_migration_donor_access_blocker.h b/src/mongo/db/repl/tenant_migration_donor_access_blocker.h index a9ec4adf9d2..88969a16f68 100644 --- a/src/mongo/db/repl/tenant_migration_donor_access_blocker.h +++ b/src/mongo/db/repl/tenant_migration_donor_access_blocker.h @@ -113,7 +113,7 @@ inline RepeatableSharedPromise<void>::~RepeatableSharedPromise() { * } * } * - * Writes call checkIfCanWriteOrThrow after being assigned an OpTime but before committing. The + * Writes call checkIfCanWrite after being assigned an OpTime but before committing. The * method throws TenantMigrationConflict if writes are being blocked, which is caught in the loop. * The write then blocks until the migration either commits (in which case checkIfCanWriteOrBlock * throws an error that causes the write to be rejected) or aborts (in which case @@ -140,10 +140,10 @@ inline RepeatableSharedPromise<void>::~RepeatableSharedPromise() { * Timestamp will be the "blockTimestamp". * * At this point: - * - Writes that have already passed checkIfCanWriteOrThrow must have been assigned an OpTime before + * - Writes that have already passed checkIfCanWrite must have been assigned an OpTime before * the blockTimestamp, since the blockTimestamp hasn't been assigned yet, and OpTimes are handed * out in monotonically increasing order. - * - Writes that have not yet passed checkIfCanWriteOrThrow will end up blocking. Some of these + * - Writes that have not yet passed checkIfCanWrite will end up blocking. Some of these * writes may have already been assigned an OpTime, or may end up being assigned an OpTime that is * before the blockTimestamp, and so will end up blocking unnecessarily, but not incorrectly. * @@ -185,10 +185,10 @@ public: // Called by all writes and reads against the database. // - void checkIfCanWriteOrThrow() final; + Status checkIfCanWrite() final; Status waitUntilCommittedOrAborted(OperationContext* opCtx, OperationType operationType) final; - void checkIfLinearizableReadWasAllowedOrThrow(OperationContext* opCtx) final; + Status checkIfLinearizableReadWasAllowed(OperationContext* opCtx) final; SharedSemiFuture<void> getCanReadFuture(OperationContext* opCtx) final; // @@ -210,9 +210,10 @@ public: void appendInfoForServerStatus(BSONObjBuilder* builder) const final; - // Returns structured info with current tenant ID and connection string. BSONObj getDebugInfo() const final; + void recordTenantMigrationError(Status status) final; + // // Called while donating this database. // @@ -240,6 +241,23 @@ private: enum class State { kAllow, kBlockWrites, kBlockWritesAndReads, kReject, kAborted }; std::string _stateToString(State state) const; + /** + * Encapsulates runtime statistics on blocked reads and writes, and tenant migration errors + * thrown. + */ + struct Stats { + AtomicWord<long long> numBlockedReads; + AtomicWord<long long> numBlockedWrites; + AtomicWord<long long> numTenantMigrationCommittedErrors; + AtomicWord<long long> numTenantMigrationAbortedErrors; + + /** + * Reports the accumulated statistics for serverStatus. + */ + void report(BSONObjBuilder* builder) const; + + } _stats; + SharedSemiFuture<void> _onCompletion() { return _completionPromise.getFuture(); } diff --git a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp index 6f551162c4b..6fdec9a3208 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp @@ -54,10 +54,10 @@ TenantMigrationRecipientAccessBlocker::TenantMigrationRecipientAccessBlocker( .getOrCreateBlockedOperationsExecutor(); } -void TenantMigrationRecipientAccessBlocker::checkIfCanWriteOrThrow() { +Status TenantMigrationRecipientAccessBlocker::checkIfCanWrite() { // This is guaranteed by the migration protocol. The recipient will not get any writes until the // migration is committed on the donor. - return; + return Status::OK(); } Status TenantMigrationRecipientAccessBlocker::waitUntilCommittedOrAborted( @@ -109,14 +109,14 @@ SharedSemiFuture<void> TenantMigrationRecipientAccessBlocker::getCanReadFuture( return SharedSemiFuture<void>(); } -void TenantMigrationRecipientAccessBlocker::checkIfLinearizableReadWasAllowedOrThrow( +Status TenantMigrationRecipientAccessBlocker::checkIfLinearizableReadWasAllowed( OperationContext* opCtx) { // The donor will block all writes at the blockOpTime, and will not signal the proxy to allow // reading from the recipient until that blockOpTime is majority committed on the recipient. // This means any writes made on the donor set are available in the majority snapshot of the // recipient, so linearizable guarantees will hold using the existing linearizable read // mechanism of doing a no-op write and waiting for it to be majority committed. - return; + return Status::OK(); } Status TenantMigrationRecipientAccessBlocker::checkIfCanBuildIndex() { diff --git a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h index 896cc957925..099b2b78a25 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h +++ b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h @@ -77,10 +77,10 @@ public: // Called by all writes and reads against the database. // - void checkIfCanWriteOrThrow() final; + Status checkIfCanWrite() final; Status waitUntilCommittedOrAborted(OperationContext* opCtx, OperationType operationType) final; - void checkIfLinearizableReadWasAllowedOrThrow(OperationContext* opCtx) final; + Status checkIfLinearizableReadWasAllowed(OperationContext* opCtx) final; SharedSemiFuture<void> getCanReadFuture(OperationContext* opCtx) final; // @@ -100,9 +100,10 @@ public: void appendInfoForServerStatus(BSONObjBuilder* builder) const final; - // Returns structured info with current tenant ID and connection string. BSONObj getDebugInfo() const final; + void recordTenantMigrationError(Status status) final{}; + // // Called as a recipient to reject reads before the `timestamp`. // diff --git a/src/mongo/db/repl/tenant_migration_recipient_access_blocker_test.cpp b/src/mongo/db/repl/tenant_migration_recipient_access_blocker_test.cpp index 5cb27284e70..aeb5e23b60a 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_access_blocker_test.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_access_blocker_test.cpp @@ -120,8 +120,8 @@ TEST_F(TenantMigrationRecipientAccessBlockerTest, NoopFunctions) { getServiceContext(), getTenantId(), getDonorConnectionString()); // These functions are noop functions and should not throw even in reject state. - mtab.checkIfCanWriteOrThrow(); - mtab.checkIfLinearizableReadWasAllowedOrThrow(opCtx()); + ASSERT_OK(mtab.checkIfCanWrite()); + ASSERT_OK(mtab.checkIfLinearizableReadWasAllowed(opCtx())); ASSERT_OK(mtab.checkIfCanBuildIndex()); } |