diff options
author | Dianna Hohensee <dianna.hohensee@10gen.com> | 2016-11-18 16:39:03 -0500 |
---|---|---|
committer | Dianna Hohensee <dianna.hohensee@10gen.com> | 2016-12-01 12:46:59 -0500 |
commit | ee64e4fa00f59fe83522772244ebcf773158d7a4 (patch) | |
tree | 65a87d91b434be9c15276a2b6b14de41134df7e0 | |
parent | 4fd718e43cd9e73cbfc8874ef219a0fb785ab0ff (diff) | |
download | mongo-ee64e4fa00f59fe83522772244ebcf773158d7a4.tar.gz |
SERVER-26849 wait for balancer thread to terminate before finishing shutdown
(cherry picked from commit a80723362643fb8a3ee0a3eb3f118f35dcac49ea)
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_external_state_impl.cpp | 12 |
-rw-r--r-- | src/mongo/db/s/balancer/balancer.cpp | 18 |
-rw-r--r-- | src/mongo/db/s/balancer/balancer.h | 24 |
3 files changed, 32 insertions, 22 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 0d08717d69a..9413d1219af 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -412,11 +412,11 @@ Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(Operati void ReplicationCoordinatorExternalStateImpl::onDrainComplete(OperationContext* txn) { invariant(!txn->lockState()->isLocked()); - // If this is a config server node becoming a primary, start the balancer + // If this is a config server node becoming a primary, ensure the balancer is ready to start. if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { - // We need to join the balancer here, because it might have been running at a previous time - // when this node was a primary. - Balancer::get(txn)->onDrainComplete(txn); + // We must ensure the balancer has stopped because it may still be in the process of + // stopping if this node was previously primary. 
+ Balancer::get(txn)->waitForBalancerToStop(); } } @@ -707,7 +707,7 @@ void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationCon void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() { if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { - Balancer::get(getGlobalServiceContext())->onStepDownFromPrimary(); + Balancer::get(getGlobalServiceContext())->interruptBalancer(); } ShardingState::get(getGlobalServiceContext())->markCollectionsNotShardedAtStepdown(); @@ -775,7 +775,7 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook distLockManager->unlockAll(txn, distLockManager->getProcessID()); // If this is a config server node becoming a primary, start the balancer - Balancer::get(txn)->onTransitionToPrimary(txn); + Balancer::get(txn)->initiateBalancer(txn); } else if (ShardingState::get(txn)->enabled()) { const auto configsvrConnStr = Grid::get(txn)->shardRegistry()->getConfigShard()->getConnString(); diff --git a/src/mongo/db/s/balancer/balancer.cpp b/src/mongo/db/s/balancer/balancer.cpp index 51f06910946..c90ecb53aae 100644 --- a/src/mongo/db/s/balancer/balancer.cpp +++ b/src/mongo/db/s/balancer/balancer.cpp @@ -53,6 +53,7 @@ #include "mongo/s/shard_util.h" #include "mongo/s/sharding_raii.h" #include "mongo/stdx/memory.h" +#include "mongo/util/exit.h" #include "mongo/util/log.h" #include "mongo/util/timer.h" #include "mongo/util/version.h" @@ -166,6 +167,15 @@ Balancer::~Balancer() { void Balancer::create(ServiceContext* serviceContext) { invariant(!getBalancer(serviceContext)); getBalancer(serviceContext) = stdx::make_unique<Balancer>(serviceContext); + + // Register a shutdown task to terminate the balancer thread so that it doesn't leak memory. + registerShutdownTask([serviceContext] { + auto balancer = Balancer::get(serviceContext); + // Make sure that the balancer thread has been interrupted. + balancer->interruptBalancer(); + // Make sure the balancer thread has terminated. 
+ balancer->waitForBalancerToStop(); + }); } Balancer* Balancer::get(ServiceContext* serviceContext) { @@ -176,7 +186,7 @@ Balancer* Balancer::get(OperationContext* operationContext) { return get(operationContext->getServiceContext()); } -void Balancer::onTransitionToPrimary(OperationContext* txn) { +void Balancer::initiateBalancer(OperationContext* txn) { stdx::lock_guard<stdx::mutex> scopedLock(_mutex); invariant(_state == kStopped); _state = kRunning; @@ -188,7 +198,7 @@ void Balancer::onTransitionToPrimary(OperationContext* txn) { _thread = stdx::thread([this] { _mainThread(); }); } -void Balancer::onStepDownFromPrimary() { +void Balancer::interruptBalancer() { stdx::lock_guard<stdx::mutex> scopedLock(_mutex); if (_state != kRunning) return; @@ -211,9 +221,7 @@ void Balancer::onStepDownFromPrimary() { _condVar.notify_all(); } -void Balancer::onDrainComplete(OperationContext* txn) { - invariant(!txn->lockState()->isLocked()); - +void Balancer::waitForBalancerToStop() { { stdx::lock_guard<stdx::mutex> scopedLock(_mutex); if (_state == kStopped) diff --git a/src/mongo/db/s/balancer/balancer.h b/src/mongo/db/s/balancer/balancer.h index d625dc64cda..d8847a4ee90 100644 --- a/src/mongo/db/s/balancer/balancer.h +++ b/src/mongo/db/s/balancer/balancer.h @@ -76,35 +76,37 @@ public: /** * Invoked when the config server primary enters the 'PRIMARY' state and is invoked while the * caller is holding the global X lock. Kicks off the main balancer thread and returns - * immediately. + * immediately. Auto-balancing (if enabled) should commence shortly, and manual migrations will + * be processed and run. * * Must only be called if the balancer is in the stopped state (i.e., just constructed or - * onDrainComplete has been called before). Any code in this call must not try to acquire any - * locks or to wait on operations, which acquire locks. + * waitForBalancerToStop has been called before). 
Any code in this call must not try to acquire + * any locks or to wait on operations, which acquire locks. */ - void onTransitionToPrimary(OperationContext* txn); + void initiateBalancer(OperationContext* txn); /** * Invoked when this node which is currently serving as a 'PRIMARY' steps down and is invoked * while the global X lock is held. Requests the main balancer thread to stop and returns - * immediately without waiting for it to terminate. + * immediately without waiting for it to terminate. Once the balancer has stopped, manual + * migrations will be rejected. * * This method might be called multiple times in succession, which is what happens as a result * of incomplete transition to primary so it is resilient to that. * - * The onDrainComplete method must be called afterwards in order to wait for the main balancer - * thread to terminate and to allow onTransitionToPrimary to be called again. + * The waitForBalancerToStop method must be called afterwards in order to wait for the main + * balancer thread to terminate and to allow initiateBalancer to be called again. */ - void onStepDownFromPrimary(); + void interruptBalancer(); /** * Invoked when a node on its way to becoming a primary finishes draining and is about to * acquire the global X lock in order to allow writes. Waits for the balancer thread to - * terminate and primes the balancer so that onTransitionToPrimary can be called. + * terminate and primes the balancer so that initiateBalancer can be called. * - * This method is called without any locks held. + * This must not be called while holding any locks! */ - void onDrainComplete(OperationContext* txn); + void waitForBalancerToStop(); /** * Potentially blocking method, which will return immediately if the balancer is not running a |