summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Dianna Hohensee <dianna.hohensee@10gen.com> 2016-11-18 16:39:03 -0500
committer: Dianna Hohensee <dianna.hohensee@10gen.com> 2016-12-01 12:46:59 -0500
commit: ee64e4fa00f59fe83522772244ebcf773158d7a4 (patch)
tree: 65a87d91b434be9c15276a2b6b14de41134df7e0
parent: 4fd718e43cd9e73cbfc8874ef219a0fb785ab0ff (diff)
download: mongo-ee64e4fa00f59fe83522772244ebcf773158d7a4.tar.gz
SERVER-26849 wait for balancer thread to terminate before finishing shutdown
(cherry picked from commit a80723362643fb8a3ee0a3eb3f118f35dcac49ea)
-rw-r--r-- src/mongo/db/repl/replication_coordinator_external_state_impl.cpp 12
-rw-r--r-- src/mongo/db/s/balancer/balancer.cpp 18
-rw-r--r-- src/mongo/db/s/balancer/balancer.h 24
3 files changed, 32 insertions, 22 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 0d08717d69a..9413d1219af 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -412,11 +412,11 @@ Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(Operati
void ReplicationCoordinatorExternalStateImpl::onDrainComplete(OperationContext* txn) {
invariant(!txn->lockState()->isLocked());
- // If this is a config server node becoming a primary, start the balancer
+ // If this is a config server node becoming a primary, ensure the balancer is ready to start.
if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
- // We need to join the balancer here, because it might have been running at a previous time
- // when this node was a primary.
- Balancer::get(txn)->onDrainComplete(txn);
+ // We must ensure the balancer has stopped because it may still be in the process of
+ // stopping if this node was previously primary.
+ Balancer::get(txn)->waitForBalancerToStop();
}
}
@@ -707,7 +707,7 @@ void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationCon
void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
- Balancer::get(getGlobalServiceContext())->onStepDownFromPrimary();
+ Balancer::get(getGlobalServiceContext())->interruptBalancer();
}
ShardingState::get(getGlobalServiceContext())->markCollectionsNotShardedAtStepdown();
@@ -775,7 +775,7 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
distLockManager->unlockAll(txn, distLockManager->getProcessID());
// If this is a config server node becoming a primary, start the balancer
- Balancer::get(txn)->onTransitionToPrimary(txn);
+ Balancer::get(txn)->initiateBalancer(txn);
} else if (ShardingState::get(txn)->enabled()) {
const auto configsvrConnStr =
Grid::get(txn)->shardRegistry()->getConfigShard()->getConnString();
diff --git a/src/mongo/db/s/balancer/balancer.cpp b/src/mongo/db/s/balancer/balancer.cpp
index 51f06910946..c90ecb53aae 100644
--- a/src/mongo/db/s/balancer/balancer.cpp
+++ b/src/mongo/db/s/balancer/balancer.cpp
@@ -53,6 +53,7 @@
#include "mongo/s/shard_util.h"
#include "mongo/s/sharding_raii.h"
#include "mongo/stdx/memory.h"
+#include "mongo/util/exit.h"
#include "mongo/util/log.h"
#include "mongo/util/timer.h"
#include "mongo/util/version.h"
@@ -166,6 +167,15 @@ Balancer::~Balancer() {
void Balancer::create(ServiceContext* serviceContext) {
invariant(!getBalancer(serviceContext));
getBalancer(serviceContext) = stdx::make_unique<Balancer>(serviceContext);
+
+ // Register a shutdown task to terminate the balancer thread so that it doesn't leak memory.
+ registerShutdownTask([serviceContext] {
+ auto balancer = Balancer::get(serviceContext);
+ // Make sure that the balancer thread has been interrupted.
+ balancer->interruptBalancer();
+ // Make sure the balancer thread has terminated.
+ balancer->waitForBalancerToStop();
+ });
}
Balancer* Balancer::get(ServiceContext* serviceContext) {
@@ -176,7 +186,7 @@ Balancer* Balancer::get(OperationContext* operationContext) {
return get(operationContext->getServiceContext());
}
-void Balancer::onTransitionToPrimary(OperationContext* txn) {
+void Balancer::initiateBalancer(OperationContext* txn) {
stdx::lock_guard<stdx::mutex> scopedLock(_mutex);
invariant(_state == kStopped);
_state = kRunning;
@@ -188,7 +198,7 @@ void Balancer::onTransitionToPrimary(OperationContext* txn) {
_thread = stdx::thread([this] { _mainThread(); });
}
-void Balancer::onStepDownFromPrimary() {
+void Balancer::interruptBalancer() {
stdx::lock_guard<stdx::mutex> scopedLock(_mutex);
if (_state != kRunning)
return;
@@ -211,9 +221,7 @@ void Balancer::onStepDownFromPrimary() {
_condVar.notify_all();
}
-void Balancer::onDrainComplete(OperationContext* txn) {
- invariant(!txn->lockState()->isLocked());
-
+void Balancer::waitForBalancerToStop() {
{
stdx::lock_guard<stdx::mutex> scopedLock(_mutex);
if (_state == kStopped)
diff --git a/src/mongo/db/s/balancer/balancer.h b/src/mongo/db/s/balancer/balancer.h
index d625dc64cda..d8847a4ee90 100644
--- a/src/mongo/db/s/balancer/balancer.h
+++ b/src/mongo/db/s/balancer/balancer.h
@@ -76,35 +76,37 @@ public:
/**
* Invoked when the config server primary enters the 'PRIMARY' state and is invoked while the
* caller is holding the global X lock. Kicks off the main balancer thread and returns
- * immediately.
+ * immediately. Auto-balancing (if enabled) should commence shortly, and manual migrations will
+ * be processed and run.
*
* Must only be called if the balancer is in the stopped state (i.e., just constructed or
- * onDrainComplete has been called before). Any code in this call must not try to acquire any
- * locks or to wait on operations, which acquire locks.
+ * waitForBalancerToStop has been called before). Any code in this call must not try to acquire
+ * any locks or to wait on operations, which acquire locks.
*/
- void onTransitionToPrimary(OperationContext* txn);
+ void initiateBalancer(OperationContext* txn);
/**
* Invoked when this node which is currently serving as a 'PRIMARY' steps down and is invoked
* while the global X lock is held. Requests the main balancer thread to stop and returns
- * immediately without waiting for it to terminate.
+ * immediately without waiting for it to terminate. Once the balancer has stopped, manual
+ * migrations will be rejected.
*
* This method might be called multiple times in succession, which is what happens as a result
* of incomplete transition to primary so it is resilient to that.
*
- * The onDrainComplete method must be called afterwards in order to wait for the main balancer
- * thread to terminate and to allow onTransitionToPrimary to be called again.
+ * The waitForBalancerToStop method must be called afterwards in order to wait for the main
+ * balancer thread to terminate and to allow initiateBalancer to be called again.
*/
- void onStepDownFromPrimary();
+ void interruptBalancer();
/**
* Invoked when a node on its way to becoming a primary finishes draining and is about to
* acquire the global X lock in order to allow writes. Waits for the balancer thread to
- * terminate and primes the balancer so that onTransitionToPrimary can be called.
+ * terminate and primes the balancer so that initiateBalancer can be called.
*
- * This method is called without any locks held.
+ * This must not be called while holding any locks!
*/
- void onDrainComplete(OperationContext* txn);
+ void waitForBalancerToStop();
/**
* Potentially blocking method, which will return immediately if the balancer is not running a