diff options
author | Brett Nawrocki <brett.nawrocki@mongodb.com> | 2021-09-29 20:03:54 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-10-18 19:00:21 +0000 |
commit | 8797d34dae0f520c2cb19ff5c40ee3835ef0cc4c (patch) | |
tree | a6facbe1650a57353b066c53ef349ace0a3e1064 /src/mongo/db/s/resharding/resharding_metrics.cpp | |
parent | 6b406da5f066f6c69478f7934cdb83c0fa5ca7cd (diff) | |
download | mongo-8797d34dae0f520c2cb19ff5c40ee3835ef0cc4c.tar.gz |
SERVER-59927 Add retry to _restoreMetrics() (tag: r5.1.0-rc1)
RecipientStateMachine::_restoreMetrics() performs a number of read
operations to calculate the number of documents it cloned, oplog entries
it fetched, and oplog entries it applied when it starts running again.
These read operations may be interrupted if the primary steps
down shortly after having been stepped up, which eventually leads to an
fassert(). Therefore, perform _restoreMetrics() in a
resharding::WithAutomaticRetry() block so that any transient errors can be
automatically retried, with the retries synchronized with the stepdown
token being canceled.
Furthermore, refactor RecipientStateMachine to use the new
RetryingCancelableOperationContextFactory to ensure that all usages of
CancelableOperationContextFactory occur within a
resharding::WithAutomaticRetry() block.
Additionally, add a test case that will cover the _restoreMetrics() read
operations being interrupted.
(cherry picked from commit b9e2784da82fef8e45b95b88e4ac1443649a5b0c)
Diffstat (limited to 'src/mongo/db/s/resharding/resharding_metrics.cpp')
-rw-r--r-- | src/mongo/db/s/resharding/resharding_metrics.cpp | 37 |
1 files changed, 18 insertions, 19 deletions
diff --git a/src/mongo/db/s/resharding/resharding_metrics.cpp b/src/mongo/db/s/resharding/resharding_metrics.cpp index 8ba5e591e73..be231bce447 100644 --- a/src/mongo/db/s/resharding/resharding_metrics.cpp +++ b/src/mongo/db/s/resharding/resharding_metrics.cpp @@ -44,6 +44,7 @@ namespace mongo { namespace { constexpr auto kAnotherOperationInProgress = "Another operation is in progress"; constexpr auto kNoOperationInProgress = "No operation is in progress"; +constexpr auto kMetricsSetBeforeRestore = "Expected metrics to be 0 prior to restore"; constexpr auto kTotalOps = "countReshardingOperations"; constexpr auto kSuccessfulOps = "countReshardingSuccessful"; @@ -564,16 +565,10 @@ void ReshardingMetrics::onDocumentsCopied(int64_t documents, int64_t bytes) noex invariant(checkState(*_currentOp->recipientState, {RecipientStateEnum::kCloning, RecipientStateEnum::kError})); - onDocumentsCopiedForCurrentOp(documents, bytes); - _cumulativeOp->documentsCopied += documents; - _cumulativeOp->bytesCopied += bytes; -} - -void ReshardingMetrics::onDocumentsCopiedForCurrentOp(int64_t documents, int64_t bytes) noexcept { - invariant(_currentOp, kNoOperationInProgress); - _currentOp->documentsCopied += documents; _currentOp->bytesCopied += bytes; + _cumulativeOp->documentsCopied += documents; + _cumulativeOp->bytesCopied += bytes; } void ReshardingMetrics::gotInserts(int n) noexcept { @@ -654,14 +649,8 @@ void ReshardingMetrics::onOplogEntriesFetched(int64_t entries) noexcept { *_currentOp->recipientState, {RecipientStateEnum::kCloning, RecipientStateEnum::kApplying, RecipientStateEnum::kError})); - onOplogEntriesFetchedForCurrentOp(entries); - _cumulativeOp->oplogEntriesFetched += entries; -} - -void ReshardingMetrics::onOplogEntriesFetchedForCurrentOp(int64_t entries) noexcept { - invariant(_currentOp, kNoOperationInProgress); - _currentOp->oplogEntriesFetched += entries; + _cumulativeOp->oplogEntriesFetched += entries; } void ReshardingMetrics::onOplogEntriesApplied(int64_t 
entries) noexcept { @@ -672,14 +661,24 @@ void ReshardingMetrics::onOplogEntriesApplied(int64_t entries) noexcept { invariant(checkState(*_currentOp->recipientState, {RecipientStateEnum::kApplying, RecipientStateEnum::kError})); - onOplogEntriesAppliedForCurrentOp(entries); + _currentOp->oplogEntriesApplied += entries; _cumulativeOp->oplogEntriesApplied += entries; } -void ReshardingMetrics::onOplogEntriesAppliedForCurrentOp(int64_t entries) noexcept { +void ReshardingMetrics::restoreForCurrentOp(int64_t documentCountCopied, + int64_t documentBytesCopied, + int64_t oplogEntriesFetched, + int64_t oplogEntriesApplied) noexcept { invariant(_currentOp, kNoOperationInProgress); - - _currentOp->oplogEntriesApplied += entries; + invariant(_currentOp->documentsCopied == 0, kMetricsSetBeforeRestore); + invariant(_currentOp->bytesCopied == 0, kMetricsSetBeforeRestore); + invariant(_currentOp->oplogEntriesFetched == 0, kMetricsSetBeforeRestore); + invariant(_currentOp->oplogEntriesApplied == 0, kMetricsSetBeforeRestore); + + _currentOp->documentsCopied = documentCountCopied; + _currentOp->bytesCopied = documentBytesCopied; + _currentOp->oplogEntriesFetched = oplogEntriesFetched; + _currentOp->oplogEntriesApplied = oplogEntriesApplied; } void ReshardingMetrics::onWriteDuringCriticalSection(int64_t writes) noexcept { |