summaryrefslogtreecommitdiff
path: root/src/mongo/db/s/resharding/resharding_metrics.cpp
diff options
context:
space:
mode:
author: Brett Nawrocki <brett.nawrocki@mongodb.com> 2021-09-29 20:03:54 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-18 19:00:21 +0000
commit: 8797d34dae0f520c2cb19ff5c40ee3835ef0cc4c (patch)
tree: a6facbe1650a57353b066c53ef349ace0a3e1064 /src/mongo/db/s/resharding/resharding_metrics.cpp
parent: 6b406da5f066f6c69478f7934cdb83c0fa5ca7cd (diff)
download: mongo-8797d34dae0f520c2cb19ff5c40ee3835ef0cc4c.tar.gz
SERVER-59927 Add retry to _restoreMetrics() (tag: r5.1.0-rc1)
RecipientStateMachine::_restoreMetrics() performs a number of read operations to calculate the number of documents it cloned, oplog entries it fetched, and oplog entries it applied when it starts running again. These read operations may be interrupted if the primary steps down shortly after having stepped up, which eventually leads to an fassert(). Therefore, perform _restoreMetrics() in a resharding::WithAutomaticRetry() block so that any transient errors can be automatically retried and synchronized with the stepdown token being canceled. Furthermore, refactor RecipientStateMachine to use the new RetryingCancelableOperationContextFactory to ensure that all usages of CancelableOperationContextFactory occur within a resharding::WithAutomaticRetry() block. Additionally, add a test case that covers the _restoreMetrics() read operations being interrupted. (cherry picked from commit b9e2784da82fef8e45b95b88e4ac1443649a5b0c)
Diffstat (limited to 'src/mongo/db/s/resharding/resharding_metrics.cpp')
-rw-r--r--src/mongo/db/s/resharding/resharding_metrics.cpp37
1 files changed, 18 insertions, 19 deletions
diff --git a/src/mongo/db/s/resharding/resharding_metrics.cpp b/src/mongo/db/s/resharding/resharding_metrics.cpp
index 8ba5e591e73..be231bce447 100644
--- a/src/mongo/db/s/resharding/resharding_metrics.cpp
+++ b/src/mongo/db/s/resharding/resharding_metrics.cpp
@@ -44,6 +44,7 @@ namespace mongo {
namespace {
constexpr auto kAnotherOperationInProgress = "Another operation is in progress";
constexpr auto kNoOperationInProgress = "No operation is in progress";
+constexpr auto kMetricsSetBeforeRestore = "Expected metrics to be 0 prior to restore";
constexpr auto kTotalOps = "countReshardingOperations";
constexpr auto kSuccessfulOps = "countReshardingSuccessful";
@@ -564,16 +565,10 @@ void ReshardingMetrics::onDocumentsCopied(int64_t documents, int64_t bytes) noex
invariant(checkState(*_currentOp->recipientState,
{RecipientStateEnum::kCloning, RecipientStateEnum::kError}));
- onDocumentsCopiedForCurrentOp(documents, bytes);
- _cumulativeOp->documentsCopied += documents;
- _cumulativeOp->bytesCopied += bytes;
-}
-
-void ReshardingMetrics::onDocumentsCopiedForCurrentOp(int64_t documents, int64_t bytes) noexcept {
- invariant(_currentOp, kNoOperationInProgress);
-
_currentOp->documentsCopied += documents;
_currentOp->bytesCopied += bytes;
+ _cumulativeOp->documentsCopied += documents;
+ _cumulativeOp->bytesCopied += bytes;
}
void ReshardingMetrics::gotInserts(int n) noexcept {
@@ -654,14 +649,8 @@ void ReshardingMetrics::onOplogEntriesFetched(int64_t entries) noexcept {
*_currentOp->recipientState,
{RecipientStateEnum::kCloning, RecipientStateEnum::kApplying, RecipientStateEnum::kError}));
- onOplogEntriesFetchedForCurrentOp(entries);
- _cumulativeOp->oplogEntriesFetched += entries;
-}
-
-void ReshardingMetrics::onOplogEntriesFetchedForCurrentOp(int64_t entries) noexcept {
- invariant(_currentOp, kNoOperationInProgress);
-
_currentOp->oplogEntriesFetched += entries;
+ _cumulativeOp->oplogEntriesFetched += entries;
}
void ReshardingMetrics::onOplogEntriesApplied(int64_t entries) noexcept {
@@ -672,14 +661,24 @@ void ReshardingMetrics::onOplogEntriesApplied(int64_t entries) noexcept {
invariant(checkState(*_currentOp->recipientState,
{RecipientStateEnum::kApplying, RecipientStateEnum::kError}));
- onOplogEntriesAppliedForCurrentOp(entries);
+ _currentOp->oplogEntriesApplied += entries;
_cumulativeOp->oplogEntriesApplied += entries;
}
-void ReshardingMetrics::onOplogEntriesAppliedForCurrentOp(int64_t entries) noexcept {
+void ReshardingMetrics::restoreForCurrentOp(int64_t documentCountCopied,
+ int64_t documentBytesCopied,
+ int64_t oplogEntriesFetched,
+ int64_t oplogEntriesApplied) noexcept {
invariant(_currentOp, kNoOperationInProgress);
-
- _currentOp->oplogEntriesApplied += entries;
+ invariant(_currentOp->documentsCopied == 0, kMetricsSetBeforeRestore);
+ invariant(_currentOp->bytesCopied == 0, kMetricsSetBeforeRestore);
+ invariant(_currentOp->oplogEntriesFetched == 0, kMetricsSetBeforeRestore);
+ invariant(_currentOp->oplogEntriesApplied == 0, kMetricsSetBeforeRestore);
+
+ _currentOp->documentsCopied = documentCountCopied;
+ _currentOp->bytesCopied = documentBytesCopied;
+ _currentOp->oplogEntriesFetched = oplogEntriesFetched;
+ _currentOp->oplogEntriesApplied = oplogEntriesApplied;
}
void ReshardingMetrics::onWriteDuringCriticalSection(int64_t writes) noexcept {