summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Gregory Wlodarek <gregory.wlodarek@mongodb.com> 2020-04-01 18:08:04 -0400
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-04-02 18:30:02 +0000
commit 8180323509e77006765bde544c8db0c3d8127317 (patch)
tree 34db8c589aeddb02b99b7121ddc0d79b16c00808
parent e6bf2e9f1e14913fdf5963d085a496328d0a52ca (diff)
download mongo-8180323509e77006765bde544c8db0c3d8127317.tar.gz
SERVER-47155 Limit the number of simultaneous index builds running from user connections to 3
(cherry picked from commit c7818dde1a910b6828f24e1e9670688bdd5e5a68)
-rw-r--r-- src/mongo/db/index_builds_coordinator_mongod.cpp 89
-rw-r--r-- src/mongo/db/index_builds_coordinator_mongod.h 7
-rw-r--r-- src/mongo/db/storage/two_phase_index_build_knobs.idl 15
3 files changed, 101 insertions, 10 deletions
diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp
index 00dd98d8428..dcdd85191ba 100644
--- a/src/mongo/db/index_builds_coordinator_mongod.cpp
+++ b/src/mongo/db/index_builds_coordinator_mongod.cpp
@@ -60,6 +60,8 @@ namespace {
MONGO_FAIL_POINT_DEFINE(hangBeforeInitializingIndexBuild);
MONGO_FAIL_POINT_DEFINE(hangAfterInitializingIndexBuild);
+const StringData kMaxNumActiveUserIndexBuildsServerParameterName = "maxNumActiveUserIndexBuilds"_sd;
+
/**
* Constructs the options for the loader thread pool.
*/
@@ -67,10 +69,13 @@ ThreadPool::Options makeDefaultThreadPoolOptions() {
ThreadPool::Options options;
options.poolName = "IndexBuildsCoordinatorMongod";
options.minThreads = 0;
- // We depend on thread pool sizes being equal between primaries and secondaries. If a secondary
- // has fewer resources than a primary, index build oplog entries can replicate in an order that
- // the secondary is unable to fulfill, leading to deadlocks. See SERVER-44250.
- options.maxThreads = 3;
+ // Both the primary and secondary nodes will have an unlimited thread pool size. This is done to
+ // allow secondary nodes to start up as many index builders as necessary in order to prevent
+ // scheduling deadlocks during initial sync or oplog application. When commands are run from
+ // user connections that need to create indexes, those commands will hang until there are fewer
+ // than 'maxNumActiveUserIndexBuilds' running index build threads, or until the operation is
+ // interrupted.
+ options.maxThreads = ThreadPool::Options::kUnlimited;
// Ensure all threads have a client.
options.onCreateThread = [](const std::string& threadName) {
@@ -85,10 +90,18 @@ ThreadPool::Options makeDefaultThreadPoolOptions() {
IndexBuildsCoordinatorMongod::IndexBuildsCoordinatorMongod()
: _threadPool(makeDefaultThreadPoolOptions()) {
_threadPool.startup();
-}
-IndexBuildsCoordinatorMongod::IndexBuildsCoordinatorMongod(ThreadPool::Options options)
- : _threadPool(std::move(options)) {
- _threadPool.startup();
+
+ // Change the 'setOnUpdate' function for the server parameter to signal the condition variable
+ // when the value changes.
+ ServerParameter* serverParam =
+ ServerParameterSet::getGlobal()->get(kMaxNumActiveUserIndexBuildsServerParameterName);
+ static_cast<
+ IDLServerParameterWithStorage<ServerParameterType::kStartupAndRuntime, AtomicWord<int>>*>(
+ serverParam)
+ ->setOnUpdate([this](const int) -> Status {
+ _indexBuildFinished.notify_all();
+ return Status::OK();
+ });
}
void IndexBuildsCoordinatorMongod::shutdown() {
@@ -110,6 +123,55 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx,
const UUID& buildUUID,
IndexBuildProtocol protocol,
IndexBuildOptions indexBuildOptions) {
+ const NamespaceStringOrUUID nssOrUuid{dbName, collectionUUID};
+
+ {
+ // Only operations originating from user connections need to wait while there are at least
+ // 'maxNumActiveUserIndexBuilds' index builds currently running.
+ if (opCtx->getClient()->isFromUserConnection()) {
+ // Need to follow the locking order here by getting the global lock first followed by
+ // the mutex. The global lock acquires the RSTL lock which we use to assert that we're
+ // the primary node when running user operations.
+ ShouldNotConflictWithSecondaryBatchApplicationBlock shouldNotConflictBlock(
+ opCtx->lockState());
+ Lock::GlobalLock globalLk(opCtx, MODE_IX);
+
+ stdx::unique_lock<Latch> lk(_mutex);
+
+ auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+ uassert(ErrorCodes::NotMaster,
+ "Not primary while waiting to start an index build",
+ replCoord->canAcceptWritesFor(opCtx, nssOrUuid));
+ opCtx->waitForConditionOrInterrupt(_indexBuildFinished, lk, [&] {
+ const int maxActiveBuilds = maxNumActiveUserIndexBuilds.load();
+ if (_numActiveIndexBuilds < maxActiveBuilds) {
+ _numActiveIndexBuilds++;
+ return true;
+ }
+
+ LOGV2(4715500,
+ "Too many index builds running simultaneously, waiting until the number of "
+ "active index builds is below the threshold",
+ "numActiveIndexBuilds"_attr = _numActiveIndexBuilds,
+ "maxNumActiveUserIndexBuilds"_attr = maxActiveBuilds,
+ "indexSpecs"_attr = specs,
+ "buildUUID"_attr = buildUUID,
+ "collectionUUID"_attr = collectionUUID);
+ return false;
+ });
+ } else {
+ // System index builds have no limit and never wait, but do consume a slot.
+ stdx::unique_lock<Latch> lk(_mutex);
+ _numActiveIndexBuilds++;
+ }
+ }
+
+ auto onScopeExitGuard = makeGuard([&] {
+ stdx::unique_lock<Latch> lk(_mutex);
+ _numActiveIndexBuilds--;
+ _indexBuildFinished.notify_one();
+ });
+
if (indexBuildOptions.twoPhaseRecovery) {
// Two phase index build recovery goes through a different set-up procedure because the
// original index will be dropped first.
@@ -150,7 +212,6 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx,
const auto deadline = opCtx->getDeadline();
const auto timeoutError = opCtx->getTimeoutError();
- const NamespaceStringOrUUID nssOrUuid{dbName, collectionUUID};
const auto nss = CollectionCatalog::get(opCtx).resolveNamespaceStringOrUUID(opCtx, nssOrUuid);
const auto& oss = OperationShardingState::get(opCtx);
@@ -179,6 +240,10 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx,
auto replState = invariant(_getIndexBuild(buildUUID));
+
+ // The thread pool task will be responsible for signalling the condition variable when the index
+ // build thread is done running.
+ onScopeExitGuard.dismiss();
_threadPool.schedule([
this,
buildUUID,
@@ -196,6 +261,12 @@ IndexBuildsCoordinatorMongod::startIndexBuild(OperationContext* opCtx,
shardVersion,
dbVersion
](auto status) mutable noexcept {
+ auto onScopeExitGuard = makeGuard([&] {
+ stdx::unique_lock<Latch> lk(_mutex);
+ _numActiveIndexBuilds--;
+ _indexBuildFinished.notify_one();
+ });
+
// Clean up if we failed to schedule the task.
if (!status.isOK()) {
stdx::unique_lock<Latch> lk(_mutex);
diff --git a/src/mongo/db/index_builds_coordinator_mongod.h b/src/mongo/db/index_builds_coordinator_mongod.h
index 8dd2335dd83..7596a4ddb97 100644
--- a/src/mongo/db/index_builds_coordinator_mongod.h
+++ b/src/mongo/db/index_builds_coordinator_mongod.h
@@ -55,7 +55,6 @@ public:
* Sets up the thread pool.
*/
IndexBuildsCoordinatorMongod();
- IndexBuildsCoordinatorMongod(ThreadPool::Options options);
/**
* Shuts down the thread pool, signals interrupt to all index builds, then waits for all of the
@@ -160,6 +159,12 @@ private:
// Thread pool on which index builds are run.
ThreadPool _threadPool;
+
+ // Protected by _mutex.
+ int _numActiveIndexBuilds = 0;
+
+ // Condition signalled to indicate that an index build thread finished executing.
+ stdx::condition_variable _indexBuildFinished;
};
} // namespace mongo
diff --git a/src/mongo/db/storage/two_phase_index_build_knobs.idl b/src/mongo/db/storage/two_phase_index_build_knobs.idl
index dbcdc45d1e3..c2a176c91f5 100644
--- a/src/mongo/db/storage/two_phase_index_build_knobs.idl
+++ b/src/mongo/db/storage/two_phase_index_build_knobs.idl
@@ -46,3 +46,18 @@ server_parameters:
cpp_vartype: bool
cpp_varname: "enableIndexBuildCommitQuorum"
default: true
+
+ maxNumActiveUserIndexBuilds:
+ description: >
+ Specifies the maximum number of active user index builds that can be built simultaneously on
+ the primary node. Index builds initiated by the system are not subject to this limitation.
+
+ However, active index builds initiated by the system still count towards the limit and can
+ delay scheduling user index builds even when the number of active user index builds is below
+ the limit.
+ set_at: [ startup, runtime ]
+ cpp_vartype: AtomicWord<int>
+ cpp_varname: maxNumActiveUserIndexBuilds
+ default: 3
+ validator:
+ gte: 0