summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Williams <louis.williams@mongodb.com>2020-03-11 18:09:33 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-04-10 19:12:40 +0000
commit98be6002347d03644c7a68259ac5b0930edc6ba7 (patch)
treedf538bb86b452099143e00bd2cceacab188a8048
parentc01d93166f824ecc0aeb57d3f02003d833769fa5 (diff)
downloadmongo-98be6002347d03644c7a68259ac5b0930edc6ba7.tar.gz
SERVER-39458 Continuously drain side writes while waiting for next index build action
(cherry picked from commit ca49965d8d68cc853e466ba741df08bb248d46f6)
-rw-r--r--jstests/noPassthrough/index_build_continuous_drain_secondary.js90
-rw-r--r--src/mongo/db/index_builds_coordinator.cpp7
-rw-r--r--src/mongo/db/index_builds_coordinator.h8
-rw-r--r--src/mongo/db/index_builds_coordinator_mongod.cpp59
-rw-r--r--src/mongo/db/index_builds_coordinator_mongod.h3
-rw-r--r--src/mongo/embedded/index_builds_coordinator_embedded.h5
6 files changed, 158 insertions, 14 deletions
diff --git a/jstests/noPassthrough/index_build_continuous_drain_secondary.js b/jstests/noPassthrough/index_build_continuous_drain_secondary.js
new file mode 100644
index 00000000000..edfbc35db22
--- /dev/null
+++ b/jstests/noPassthrough/index_build_continuous_drain_secondary.js
@@ -0,0 +1,90 @@
+/**
+ * Tests that secondaries drain side writes while waiting for the primary to commit an index build.
+ *
+ * This test does not make very many correctness assertions because this exercises a performance
+ * optimization. Instead we log the time difference between how long the primary and secondary took
+ * to complete the index builds. The expectation is that these values are close to each other.
+ *
+ * @tags: [requires_replication]
+ *
+ */
+(function() {
+load("jstests/noPassthrough/libs/index_build.js");
+
+const replSet = new ReplSetTest({
+ nodes: [
+ {},
+ {
+ // Disallow elections on secondary.
+ rsConfig: {
+ priority: 0,
+ votes: 0,
+ },
+ },
+ ]
+});
+
+replSet.startSet();
+replSet.initiate();
+
+const primary = replSet.getPrimary();
+if (!IndexBuildTest.supportsTwoPhaseIndexBuild(primary)) {
+ jsTestLog('Skipping test because two phase index builds are not supported.');
+ replSet.stopSet();
+ return;
+}
+
+const dbName = 'test';
+const primaryDB = primary.getDB(dbName);
+const coll = primaryDB.test;
+
+let insertDocs = function(numDocs) {  // Bulk-inserts {a: i, b: i} for i in [0, numDocs) via the primary.
+ const bulk = coll.initializeUnorderedBulkOp();
+ for (let i = 0; i < numDocs; i++) {
+ bulk.insert({a: i, b: i});
+ }
+ assert.commandWorked(bulk.execute());
+};
+insertDocs(10000);  // Seed the collection before the index build is started.
+replSet.awaitReplication();
+
+// Start and pause the index build on the primary so that it does not start collection scanning.
+IndexBuildTest.pauseIndexBuilds(primary);
+const createIdx = IndexBuildTest.startIndexBuild(primary, coll.getFullName(), {a: 1, b: 1});
+
+const secondary = replSet.getSecondary();
+const secondaryDB = secondary.getDB(dbName);
+
+// Wait until the secondary reports that it is ready to commit.
+// "Index build waiting for next action before completing final phase"
+checkLog.containsJson(secondary, 3856203);
+
+// Insert a high volume of documents. Since the secondary has reported that it is ready to commit,
+// the expectation is that the secondary will intercept and drain these writes as they are
+// replicated from primary.
+insertDocs(50000);
+// "index build: drained side writes"
+checkLog.containsJson(secondary, 20689);
+
+// Record how long it takes for the index build to complete from this point onward.
+let start = new Date();
+IndexBuildTest.resumeIndexBuilds(primary);
+
+// Wait for index build to finish on primary.
+createIdx();
+let primaryEnd = new Date();
+
+// Wait for the index build to complete on the secondary.
+IndexBuildTest.waitForIndexBuildToStop(secondaryDB);
+let secondaryEnd = new Date();
+
+// We don't make any assertions about these times, just report them for informational purposes. The
+// expectation is that they are as close to each other as possible, which would suggest that the
+// secondary does not spend more time completing the index than the primary.
+jsTestLog("these values should be similar:");
+jsTestLog("elapsed on primary: " + (primaryEnd - start));  // Date difference, in milliseconds.
+jsTestLog("elapsed on secondary: " + (secondaryEnd - start));  // Date difference, in milliseconds.
+
+IndexBuildTest.assertIndexes(coll, 2, ['_id_', 'a_1_b_1']);  // Presumably: exactly these 2 indexes exist - confirm in index_build.js.
+replSet.stopSet();
+})();
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index 93e5c2c6c18..896f6930aa9 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -2000,6 +2000,7 @@ void IndexBuildsCoordinator::_buildIndexSinglePhase(
boost::optional<Lock::CollectionLock>* exclusiveCollectionLock) {
_scanCollectionAndInsertKeysIntoSorter(opCtx, replState, exclusiveCollectionLock);
_insertKeysFromSideTablesWithoutBlockingWrites(opCtx, replState);
+ _insertKeysFromSideTablesBlockingWrites(opCtx, replState);
_signalPrimaryForCommitReadiness(opCtx, replState);
_waitForNextIndexBuildAction(opCtx, replState);
_insertKeysFromSideTablesAndCommit(
@@ -2014,6 +2015,7 @@ void IndexBuildsCoordinator::_buildIndexTwoPhase(
_scanCollectionAndInsertKeysIntoSorter(opCtx, replState, exclusiveCollectionLock);
_insertKeysFromSideTablesWithoutBlockingWrites(opCtx, replState);
+ _insertKeysFromSideTablesBlockingWrites(opCtx, replState);
_signalPrimaryForCommitReadiness(opCtx, replState);
auto commitIndexBuildTimestamp = _waitForNextIndexBuildAction(opCtx, replState);
@@ -2091,7 +2093,10 @@ void IndexBuildsCoordinator::_insertKeysFromSideTablesWithoutBlockingWrites(
LOGV2(20666, "Hanging after index build first drain");
hangAfterIndexBuildFirstDrain.pauseWhileSet();
}
-
+}
+void IndexBuildsCoordinator::_insertKeysFromSideTablesBlockingWrites(
+ OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) {
+ const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
// Perform the second drain while stopping writes on the collection.
{
opCtx->recoveryUnit()->abandonSnapshot();
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index 9f9a250ea0d..20b4e44e544 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -611,6 +611,8 @@ protected:
*/
void _insertKeysFromSideTablesWithoutBlockingWrites(
OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState);
+ void _insertKeysFromSideTablesBlockingWrites(OperationContext* opCtx,
+ std::shared_ptr<ReplIndexBuildState> replState);  // Performs the second side-writes drain while stopping writes on the collection.
/**
* Reads the commit ready members list for index build UUID in 'replState' from
@@ -640,6 +642,12 @@ protected:
OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) = 0;
/**
+ * Drains the side-writes table periodically while waiting for the IndexBuildAction to be ready.
+ */
+ virtual IndexBuildAction _drainSideWritesUntilNextActionIsAvailable(
+ OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) = 0;
+
+ /**
* Both primary and secondaries will wait on 'ReplIndexBuildState::waitForNextAction' future for
* commit or abort index build signal.
* On primary:
diff --git a/src/mongo/db/index_builds_coordinator_mongod.cpp b/src/mongo/db/index_builds_coordinator_mongod.cpp
index 47c26e4b53a..7a75f2d61f4 100644
--- a/src/mongo/db/index_builds_coordinator_mongod.cpp
+++ b/src/mongo/db/index_builds_coordinator_mongod.cpp
@@ -610,27 +610,57 @@ void IndexBuildsCoordinatorMongod::_signalPrimaryForCommitReadiness(
return;
}
+IndexBuildAction IndexBuildsCoordinatorMongod::_drainSideWritesUntilNextActionIsAvailable(
+ OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) {
+ auto future = [&] {  // Obtained once while holding replState->mutex; reused by every wait below.
+ stdx::unique_lock<Latch> lk(replState->mutex);
+ invariant(replState->waitForNextAction);
+ return replState->waitForNextAction->getFuture();
+ }();
+
+ // Waits until the promise is fulfilled (returns true) or the 1000ms deadline expires (false).
+ IndexBuildAction nextAction;  // Assigned only when a wait completes without timing out.
+ auto waitUntilNextActionIsReady = [&]() {
+ // Don't perform a blocking wait while holding locks or storage engine resources.
+ opCtx->recoveryUnit()->abandonSnapshot();
+ Lock::TempRelease release(opCtx->lockState());
+
+ auto deadline = Date_t::now() + Milliseconds(1000);
+ auto timeoutError = opCtx->getTimeoutError();
+
+ try {
+ nextAction =
+ opCtx->runWithDeadline(deadline, timeoutError, [&] { return future.get(opCtx); });
+ } catch (const ExceptionForCat<ErrorCategory::ExceededTimeLimitError>& e) {
+ if (e.code() == timeoutError) {  // Same code that was passed to runWithDeadline: not ready yet.
+ return false;
+ }
+ throw;  // Any other time-limit error is propagated to the caller.
+ }
+ return true;
+ };
+
+ // Continuously drain incoming writes until the future is ready. This is an optimization that
+ // allows the critical section of committing, which must drain the remainder of the side writes,
+ // to be as short as possible.
+ while (!waitUntilNextActionIsReady()) {
+ _insertKeysFromSideTablesWithoutBlockingWrites(opCtx, replState);
+ }
+ return nextAction;
+}
+
Timestamp IndexBuildsCoordinatorMongod::_waitForNextIndexBuildAction(
OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) {
Timestamp commitIndexBuildTimestamp;
- // Yield locks and storage engine resources before blocking.
- opCtx->recoveryUnit()->abandonSnapshot();
- Lock::TempRelease release(opCtx->lockState());
-
LOGV2(3856203,
"Index build waiting for next action before completing final phase: {buildUUID}",
"buildUUID"_attr = replState->buildUUID);
while (true) {
- // Future wait should ignore state transition.
- invariant(!opCtx->lockState()->isRSTLLocked(),
- str::stream()
- << "failed to yield locks for index build while waiting for commit or abort: "
- << replState->buildUUID);
-
- // future wait should get interrupted if the node shutdowns.
- const auto nextAction = replState->waitForNextAction->getFuture().get(opCtx);
+ // Future wait can be interrupted. This function will yield locks while waiting for the
+ // future to be fulfilled.
+ const auto nextAction = _drainSideWritesUntilNextActionIsAvailable(opCtx, replState);
LOGV2(3856204,
"Index build received signal for build uuid: {buildUUID} , action: {action}",
"buildUUID"_attr = replState->buildUUID,
@@ -638,8 +668,11 @@ Timestamp IndexBuildsCoordinatorMongod::_waitForNextIndexBuildAction(
bool needsToRetryWait = false;
- // Reacquire RSTL lock
+ // Ensure RSTL is acquired before checking replication state. This is only necessary for
+ // single-phase builds on secondaries. Everywhere else, the RSTL is already held and this
+ // should never block.
repl::ReplicationStateTransitionLockGuard rstl(opCtx, MODE_IX);
+
const NamespaceStringOrUUID dbAndUUID(replState->dbName, replState->collectionUUID);
auto replCoord = repl::ReplicationCoordinator::get(opCtx);
auto isMaster = replCoord->canAcceptWritesFor(opCtx, dbAndUUID);
diff --git a/src/mongo/db/index_builds_coordinator_mongod.h b/src/mongo/db/index_builds_coordinator_mongod.h
index 7596a4ddb97..6c2a89a7c72 100644
--- a/src/mongo/db/index_builds_coordinator_mongod.h
+++ b/src/mongo/db/index_builds_coordinator_mongod.h
@@ -154,6 +154,9 @@ private:
void _signalPrimaryForCommitReadiness(OperationContext* opCtx,
std::shared_ptr<ReplIndexBuildState> replState) override;
+ IndexBuildAction _drainSideWritesUntilNextActionIsAvailable(
+ OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) override;
+
Timestamp _waitForNextIndexBuildAction(OperationContext* opCtx,
std::shared_ptr<ReplIndexBuildState> replState) override;
diff --git a/src/mongo/embedded/index_builds_coordinator_embedded.h b/src/mongo/embedded/index_builds_coordinator_embedded.h
index 3ebe78c4e83..06f39bba115 100644
--- a/src/mongo/embedded/index_builds_coordinator_embedded.h
+++ b/src/mongo/embedded/index_builds_coordinator_embedded.h
@@ -92,6 +92,11 @@ private:
void _signalPrimaryForCommitReadiness(OperationContext* opCtx,
std::shared_ptr<ReplIndexBuildState> replState) override;
+ IndexBuildAction _drainSideWritesUntilNextActionIsAvailable(  // Stub: drains nothing, returns a value-initialized IndexBuildAction.
+ OperationContext* opCtx, std::shared_ptr<ReplIndexBuildState> replState) override {
+ return {};
+ }
+
Timestamp _waitForNextIndexBuildAction(OperationContext* opCtx,
std::shared_ptr<ReplIndexBuildState> replState) override;
};