author     Randolph Tan <randolph@10gen.com>  2022-06-23 19:16:24 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2022-06-23 20:10:18 +0000
commit     1b1d8538d0dab776ecccadcecc64f6d7eaebb5e7 (patch)
tree       444f1a074eca671513dbd005484093db6ea9c337
parent     7ad5fbf86418d2a6f17d553b5888198c5f9de05a (diff)
SERVER-67370 Destroy instance metrics object early so its lifetime will not be tied to the future callback function object
-rw-r--r--  src/mongo/db/s/resharding/resharding_coordinator_service.cpp  10
-rw-r--r--  src/mongo/db/s/resharding/resharding_donor_service.cpp        10
-rw-r--r--  src/mongo/db/s/resharding/resharding_recipient_service.cpp     7
3 files changed, 22 insertions, 5 deletions
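The change is the same in all three resharding state machines: record the final metrics state transition, then destroy the metrics object immediately, instead of letting it die with the state machine. The future continuations capture `self = shared_from_this()`, so the executor's copy of the callback keeps the state machine (and, before this patch, its metrics) alive past logical completion. Below is a minimal sketch of that pattern; `Metrics`, `StateMachine`, and the `std::function` standing in for the stored future callback are hypothetical illustrations, not the actual MongoDB classes:

    #include <functional>
    #include <iostream>
    #include <memory>

    struct Metrics {
        void onStateTransition() { std::cout << "final state recorded\n"; }
        ~Metrics() { std::cout << "metrics destroyed\n"; }
    };

    class StateMachine : public std::enable_shared_from_this<StateMachine> {
    public:
        // Returns the completion callback; in the real service this is a future
        // continuation owned by an executor for an unpredictable amount of time.
        std::function<void()> makeCompletionCallback() {
            return [this, self = shared_from_this()] {
                _metrics->onStateTransition();  // record the final transition first
                _metrics.reset();               // then free metrics eagerly, so its
                                                // lifetime is not tied to the callback
            };
        }

    private:
        std::unique_ptr<Metrics> _metrics = std::make_unique<Metrics>();
    };

    int main() {
        auto sm = std::make_shared<StateMachine>();
        auto cb = sm->makeCompletionCallback();
        sm.reset();  // caller drops its reference; `cb` still holds one
        cb();        // metrics is destroyed here, even though the StateMachine
                     // itself stays alive until `cb` is destroyed
    }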
diff --git a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
index fe15da0a5ca..9aa5ed7c223 100644
--- a/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_coordinator_service.cpp
@@ -1399,6 +1399,14 @@ SemiFuture<void> ReshardingCoordinatorService::ReshardingCoordinator::run(
.onCompletion([outerStatus](Status) { return outerStatus; });
})
.onCompletion([this, self = shared_from_this()](Status status) {
+ _metrics->onStateTransition(toMetricsState(_coordinatorDoc.getState()), boost::none);
+
+ // Destroy metrics early so its lifetime will not be tied to the lifetime of this
+ // state machine. Future callbacks copy shared pointers to this state machine,
+ // causing it to live longer than expected and potentially overlap with a newer
+ // instance when stepping up.
+ _metrics.reset();
+
if (!status.isOK()) {
{
auto lg = stdx::lock_guard(_fulfillmentMutex);
@@ -1412,8 +1420,6 @@ SemiFuture<void> ReshardingCoordinatorService::ReshardingCoordinator::run(
}
_reshardingCoordinatorObserver->interrupt(status);
}
-
- _metrics->onStateTransition(toMetricsState(_coordinatorDoc.getState()), boost::none);
})
.semi();
}
diff --git a/src/mongo/db/s/resharding/resharding_donor_service.cpp b/src/mongo/db/s/resharding/resharding_donor_service.cpp
index b25b25185ce..40b1f17f179 100644
--- a/src/mongo/db/s/resharding/resharding_donor_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_donor_service.cpp
@@ -414,6 +414,14 @@ ExecutorFuture<void> ReshardingDonorService::DonorStateMachine::_finishReshardin
Status ReshardingDonorService::DonorStateMachine::_runMandatoryCleanup(
Status status, const CancellationToken& stepdownToken) {
+ _metrics->onStateTransition(toMetricsState(_donorCtx.getState()), boost::none);
+
+ // Destroy metrics early so its lifetime will not be tied to the lifetime of this state
+ // machine. Future callbacks copy shared pointers to this state machine, causing it to
+ // live longer than expected and potentially overlap with a newer instance when stepping
+ // up.
+ _metrics.reset();
+
if (!status.isOK()) {
// If the stepdownToken was triggered, it takes priority in order to make sure that
// the promise is set with an error that can be retried with. If it ran into an
@@ -431,8 +439,6 @@ Status ReshardingDonorService::DonorStateMachine::_runMandatoryCleanup(
ensureFulfilledPromise(lk, _completionPromise, statusForPromise);
}
- _metrics->onStateTransition(toMetricsState(_donorCtx.getState()), boost::none);
-
return status;
}
diff --git a/src/mongo/db/s/resharding/resharding_recipient_service.cpp b/src/mongo/db/s/resharding/resharding_recipient_service.cpp
index f52e3b8a3a7..5b66d19e8bd 100644
--- a/src/mongo/db/s/resharding/resharding_recipient_service.cpp
+++ b/src/mongo/db/s/resharding/resharding_recipient_service.cpp
@@ -421,6 +421,12 @@ ExecutorFuture<void> ReshardingRecipientService::RecipientStateMachine::_runMand
isCanceled = stepdownToken.isCanceled()](Status dataReplicationHaltStatus) {
_metrics->onStateTransition(toMetricsState(_recipientCtx.getState()), boost::none);
+ // Destroy metrics early so its lifetime will not be tied to the lifetime of this
+ // state machine. Future callbacks copy shared pointers to this state machine,
+ // causing it to live longer than expected and potentially overlap with a newer
+ // instance when stepping up.
+ _metrics.reset();
+
// If the stepdownToken was triggered, it takes priority in order to make sure that
// the promise is set with an error that the coordinator can retry with. If it ran into
// an unrecoverable error, it would have fasserted earlier.
@@ -434,7 +440,6 @@ ExecutorFuture<void> ReshardingRecipientService::RecipientStateMachine::_runMand
// replication errors because resharding is known to have failed already.
stdx::lock_guard<Latch> lk(_mutex);
ensureFulfilledPromise(lk, _completionPromise, outerStatus);
-
return outerStatus;
});
}
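Why the overlap matters: on step-up a fresh instance of the same state machine is constructed for the in-progress operation, and if the old instance's metrics object is still alive inside a stale callback, the two can coexist. The sketch below illustrates the hazard, assuming a hypothetical registry that permits at most one live metrics object per operation (illustrative only; the actual MongoDB registration machinery differs):

    #include <cassert>
    #include <memory>
    #include <set>
    #include <string>

    class MetricsRegistry {
    public:
        void add(const std::string& opId) {
            bool inserted = _active.insert(opId).second;
            assert(inserted && "metrics for this operation already registered");
            (void)inserted;  // silence unused warning in NDEBUG builds
        }
        void remove(const std::string& opId) { _active.erase(opId); }

    private:
        std::set<std::string> _active;
    };

    class Metrics {
    public:
        Metrics(MetricsRegistry& reg, std::string opId)
            : _reg(reg), _opId(std::move(opId)) {
            _reg.add(_opId);
        }
        ~Metrics() { _reg.remove(_opId); }

    private:
        MetricsRegistry& _reg;
        std::string _opId;
    };

    int main() {
        MetricsRegistry registry;
        // Old instance, alive only because a stale future callback still holds
        // a shared_ptr to its state machine:
        auto old = std::make_unique<Metrics>(registry, "resharding-1234");
        old.reset();  // the patch's early _metrics.reset() plays this role
        // Step-up can now safely construct metrics for the same operation:
        Metrics fresh(registry, "resharding-1234");
    }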