author      Eric Milkie <milkie@10gen.com>  2020-04-06 14:45:23 -0400
committer   Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-04-08 20:39:40 +0000
commit      56b3fda768b6009c78310893d88e929025137291 (patch)
tree        43de73be5e36b6def5a40aa1999e4a51cb1a84b8
parent      72e265eccbb6e49497eb386a0829ed75bae8c032 (diff)
download    mongo-56b3fda768b6009c78310893d88e929025137291.tar.gz
SERVER-44507 retry hybrid index build drain mode until all prepared transactions on the collection have committed
-rw-r--r--  jstests/replsets/index_build_stepdown_stepup_stepdown.js  127
-rw-r--r--  src/mongo/db/catalog/index_builds_manager.cpp    5
-rw-r--r--  src/mongo/db/catalog/index_builds_manager.h    6
-rw-r--r--  src/mongo/db/catalog/multi_index_block.cpp    14
-rw-r--r--  src/mongo/db/catalog/multi_index_block.h    5
-rw-r--r--  src/mongo/db/index/index_build_interceptor.cpp    6
-rw-r--r--  src/mongo/db/index_builds_coordinator.cpp    62
7 files changed, 197 insertions(+), 28 deletions(-)
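
The core of this change is in index_builds_coordinator.cpp below: instead of performing one final drain under the exclusive collection lock, the coordinator now loops, draining and re-checking, until the side-writes table is truly exhausted. The standalone sketch below models why the extra check is needed: a prepared transaction's inserts into the side-writes table are counted by the interceptor's in-memory counter when they are written, but the rows themselves stay invisible to a drain cursor until the transaction commits. SideWritesTable is a simplified stand-in for illustration, not the real MongoDB class; the real operations are drainBackgroundWrites and areAllWritesApplied.

#include <atomic>
#include <chrono>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

struct SideWritesTable {
    std::mutex mu;
    std::deque<int> rows;           // side writes visible to a drain cursor
    std::atomic<long> recorded{0};  // counted when the write is recorded
    long applied = 0;               // rows drained into the index so far

    void record(int key) {          // an ordinary (non-prepared) side write
        std::lock_guard<std::mutex> lk(mu);
        rows.push_back(key);
        recorded.fetch_add(1);
    }

    void drain() {                  // stand-in for drainBackgroundWrites
        std::lock_guard<std::mutex> lk(mu);
        while (!rows.empty()) {
            rows.pop_front();
            ++applied;
        }
    }

    bool allWritesApplied() {       // stand-in for areAllWritesApplied
        std::lock_guard<std::mutex> lk(mu);
        return rows.empty() && recorded.load() == applied;
    }
};

int main() {
    SideWritesTable table;
    table.record(1);

    // A prepared transaction: its side-table insert was already counted, but
    // the row stays invisible to the drain cursor until the transaction commits.
    table.recorded.fetch_add(1);
    std::thread preparedTxn([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(300));
        std::lock_guard<std::mutex> lk(table.mu);
        table.rows.push_back(2);    // commit finally makes the write visible
    });

    // The coordinator's new commit path: drain and re-check until nothing is
    // outstanding, instead of assuming one final drain sees everything.
    int attempts = 0;
    while (true) {
        table.drain();
        ++attempts;
        if (table.allWritesApplied())
            break;
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
    preparedTxn.join();
    std::cout << "all side writes applied after " << attempts << " drains\n";
}
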
diff --git a/jstests/replsets/index_build_stepdown_stepup_stepdown.js b/jstests/replsets/index_build_stepdown_stepup_stepdown.js
new file mode 100644
index 00000000000..30e51806196
--- /dev/null
+++ b/jstests/replsets/index_build_stepdown_stepup_stepdown.js
@@ -0,0 +1,127 @@
+/*
+ * This test starts an index build, then through a series of failovers, demonstrates that the index
+ * build does not miss any writes from outstanding prepared transactions. This is an issue because
+ * prepared transactions do not hold locks on secondaries, which can allow an index build to
+ * commit without waiting for the prepared transaction's writes.
+ *
+ * @tags: [
+ * requires_document_locking,
+ * requires_replication,
+ * uses_prepare_transaction,
+ * uses_transactions,
+ * ]
+ */
+load("jstests/replsets/rslib.js");
+load("jstests/core/txns/libs/prepare_helpers.js");
+
+(function() {
+
+"use strict";
+
+const rst = new ReplSetTest({nodes: 2});
+rst.startSet();
+rst.initiate();
+
+const dbName = jsTestName();
+const collName = "coll";
+
+const primary = rst.getPrimary();
+const primaryDB = primary.getDB(dbName);
+const primaryColl = primaryDB[collName];
+const collNss = primaryColl.getFullName();
+
+jsTestLog("Do document writes");
+for (var i = 0; i < 4; i++) {
+ assert.commandWorked(primaryColl.insert({_id: i, y: i}, {"writeConcern": {"w": 1}}));
+}
+rst.awaitReplication();
+
+const secondary = rst.getSecondary();
+const secondaryAdmin = secondary.getDB("admin");
+const secondaryColl = secondary.getDB(dbName)[collName];
+
+assert.commandWorked(
+ secondary.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 2}));
+
+jsTestLog("Enable setYieldAllLocksHang fail point");
+assert.commandWorked(secondaryAdmin.runCommand(
+ {configureFailPoint: "setYieldAllLocksHang", data: {namespace: collNss}, mode: "alwaysOn"}));
+
+TestData.dbName = dbName;
+TestData.collName = collName;
+
+const indexThread = startParallelShell(() => {
+ jsTestLog("Create index");
+ const primaryDB = db.getSiblingDB(TestData.dbName);
+ assert.commandWorked(primaryDB[TestData.collName].createIndex({"y": 1}));
+}, primary.port);
+
+// Wait until index build (collection scan phase) on secondary yields.
+jsTestLog("Wait for the hybrid index build on secondary to hang");
+assert.soon(
+ function() {
+ const result = secondaryAdmin.currentOp({"command.createIndexes": collName});
+ assert.commandWorked(result);
+ if (result.inprog.length === 1 && result.inprog[0].numYields > 0) {
+ return true;
+ }
+
+ return false;
+ },
+ function() {
+ return "Failed to find operation in currentOp() output: " +
+ tojson(secondaryAdmin.currentOp());
+ },
+ 30 * 1000);
+
+jsTestLog("Step up the current secondary");
+assert.commandWorked(secondary.adminCommand({"replSetStepUp": 1}));
+waitForState(secondary, ReplSetTest.State.PRIMARY);
+waitForState(primary, ReplSetTest.State.SECONDARY);
+const newPrimary = rst.getPrimary();
+
+// Make sure the secondary was able to step up successfully.
+assert.eq(newPrimary, secondary);
+
+jsTestLog("Start a txn");
+const session = newPrimary.startSession();
+const sessionDB = session.getDatabase(dbName);
+const sessionColl = sessionDB.getCollection(collName);
+session.startTransaction();
+assert.commandWorked(sessionColl.insert({_id: 20}));
+
+jsTestLog("Prepare txn");
+const prepareTimestamp = PrepareHelpers.prepareTransaction(session);
+
+// Make the hybrid index build hang after the first drain.
+assert.commandWorked(newPrimary.adminCommand(
+ {configureFailPoint: "hangAfterIndexBuildFirstDrain", mode: "alwaysOn"}));
+
+// This allows the hybrid index build previously started on the secondary (now the primary) to resume.
+jsTestLog("Disable setYieldAllLocksHang fail point");
+assert.commandWorked(
+ newPrimary.adminCommand({configureFailPoint: "setYieldAllLocksHang", mode: "off"}));
+
+// Wait until the first drain has completed.
+checkLog.contains(newPrimary, "Hanging after index build first drain");
+
+assert.commandWorked(
+ newPrimary.adminCommand({configureFailPoint: "hangAfterIndexBuildFirstDrain", mode: "off"}));
+
+jsTestLog("Step up the old primary");
+assert.commandWorked(primary.adminCommand({"replSetStepUp": 1}));
+waitForState(primary, ReplSetTest.State.PRIMARY);
+rst.awaitReplication();
+
+// Create a proxy session in order to complete the prepared transaction.
+const newSession = new _DelegatingDriverSession(primary, session);
+
+assert.commandWorked(PrepareHelpers.commitTransaction(newSession, prepareTimestamp));
+indexThread();
+rst.awaitReplication();
+
+assert.soon(() => 2 == primaryColl.getIndexes().length);
+assert.soon(() => 2 == secondaryColl.getIndexes().length);
+
+rst.stopSet();
+})();
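
The test above steers the server through failpoints: setYieldAllLocksHang parks the collection scan on the secondary, and hangAfterIndexBuildFirstDrain parks the build between drains so the prepared transaction can be set up at exactly the right moment. The sketch below is a minimal model of the pause-while-set primitive those failpoints provide; the real mechanism is MongoDB's FailPoint class and the MONGO_FAIL_POINT_PAUSE_WHILE_SET macro (used later in this diff), and this simplified version is only illustrative.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class FailPoint {
public:
    void enable() {
        std::lock_guard<std::mutex> lk(_mu);
        _enabled = true;
    }
    void disable() {
        {
            std::lock_guard<std::mutex> lk(_mu);
            _enabled = false;
        }
        _cv.notify_all();
    }
    // In the spirit of MONGO_FAIL_POINT_PAUSE_WHILE_SET: block the calling
    // thread for as long as the failpoint remains enabled.
    void pauseWhileSet() {
        std::unique_lock<std::mutex> lk(_mu);
        _cv.wait(lk, [this] { return !_enabled; });
    }

private:
    std::mutex _mu;
    std::condition_variable _cv;
    bool _enabled = false;
};

int main() {
    FailPoint hangAfterFirstDrain;
    hangAfterFirstDrain.enable();  // configureFailPoint ... mode: "alwaysOn"

    std::thread indexBuilder([&] {
        // ... the first drain would happen here ...
        hangAfterFirstDrain.pauseWhileSet();  // builder parks until released
        std::cout << "builder resumed\n";
    });

    // The test does its setup (prepare a transaction, etc.) while the builder
    // is parked, then releases it.
    hangAfterFirstDrain.disable();  // configureFailPoint ... mode: "off"
    indexBuilder.join();
}
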
diff --git a/src/mongo/db/catalog/index_builds_manager.cpp b/src/mongo/db/catalog/index_builds_manager.cpp
index 58154ea1042..7ad6d6b1c70 100644
--- a/src/mongo/db/catalog/index_builds_manager.cpp
+++ b/src/mongo/db/catalog/index_builds_manager.cpp
@@ -219,6 +219,11 @@ Status IndexBuildsManager::drainBackgroundWrites(OperationContext* opCtx,
return builder->drainBackgroundWrites(opCtx, readSource);
}
+bool IndexBuildsManager::areAllWritesApplied(OperationContext* opCtx, const UUID& buildUUID) {
+ auto builder = _getBuilder(buildUUID);
+ return builder->areAllWritesApplied(opCtx);
+}
+
Status IndexBuildsManager::finishBuildingPhase(const UUID& buildUUID) {
auto multiIndexBlockPtr = _getBuilder(buildUUID);
// TODO: verify that the index builder is in the expected state.
diff --git a/src/mongo/db/catalog/index_builds_manager.h b/src/mongo/db/catalog/index_builds_manager.h
index 6c3f14ad04e..d7c5218a92c 100644
--- a/src/mongo/db/catalog/index_builds_manager.h
+++ b/src/mongo/db/catalog/index_builds_manager.h
@@ -124,6 +124,12 @@ public:
RecoveryUnit::ReadSource readSource);
/**
+ * For the given index build, check whether all the side writes have been applied from all the
+ * side write tables.
+ */
+ bool areAllWritesApplied(OperationContext* opCtx, const UUID& buildUUID);
+
+ /**
* Persists information in the index catalog entry to reflect the successful completion of the
* scanning/insertion phase.
*
diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp
index 5426298bcc0..690e8cd83a1 100644
--- a/src/mongo/db/catalog/multi_index_block.cpp
+++ b/src/mongo/db/catalog/multi_index_block.cpp
@@ -727,6 +727,20 @@ Status MultiIndexBlock::dumpInsertsFromBulk(OperationContext* opCtx,
return Status::OK();
}
+bool MultiIndexBlock::areAllWritesApplied(OperationContext* opCtx) {
+ for (size_t i = 0; i < _indexes.size(); i++) {
+ auto interceptor = _indexes[i].block->getEntry()->indexBuildInterceptor();
+ if (!interceptor)
+ continue;
+
+ if (!interceptor->areAllWritesApplied(opCtx)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
Status MultiIndexBlock::drainBackgroundWrites(OperationContext* opCtx,
RecoveryUnit::ReadSource readSource) {
if (State::kAborted == _getState()) {
diff --git a/src/mongo/db/catalog/multi_index_block.h b/src/mongo/db/catalog/multi_index_block.h
index b126d4cc519..f456162f2e9 100644
--- a/src/mongo/db/catalog/multi_index_block.h
+++ b/src/mongo/db/catalog/multi_index_block.h
@@ -174,6 +174,11 @@ public:
Status dumpInsertsFromBulk(OperationContext* opCtx, std::set<RecordId>* const dupRecords);
/**
+ * Check whether all the side writes have been applied from all the side write tables.
+ */
+ bool areAllWritesApplied(OperationContext* opCtx);
+
+ /**
* For background indexes using an IndexBuildInterceptor to capture inserts during a build,
* drain these writes into the index. If intent locks are held on the collection, more writes
* may come in after this drain completes. To ensure that all writes are completely drained
diff --git a/src/mongo/db/index/index_build_interceptor.cpp b/src/mongo/db/index/index_build_interceptor.cpp
index a965feb4c60..34e4c203bab 100644
--- a/src/mongo/db/index/index_build_interceptor.cpp
+++ b/src/mongo/db/index/index_build_interceptor.cpp
@@ -357,7 +357,6 @@ bool IndexBuildInterceptor::areAllWritesApplied(OperationContext* opCtx) const {
invariant(_sideWritesTable);
auto cursor = _sideWritesTable->rs()->getCursor(opCtx);
auto record = cursor->next();
-
// The table is empty only when all writes are applied.
if (!record) {
auto writesRecorded = _sideWritesCounter->load();
@@ -366,9 +365,8 @@ bool IndexBuildInterceptor::areAllWritesApplied(OperationContext* opCtx) const {
<< "The number of side writes recorded does not match the number "
"applied, despite the table appearing empty. Writes recorded: "
<< writesRecorded << ", applied: " << _numApplied;
-
- dassert(writesRecorded == _numApplied, message);
- warning() << message;
+ log() << message;
+ return false;
}
return true;
}
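
The interceptor change above turns a should-never-happen condition into an expected, retryable one. Previously, an empty side-writes table with mismatched counters tripped a debug assert and a warning; with prepared transactions it is a normal transient state, so areAllWritesApplied now logs and reports false. Below is a simplified restatement of the new control flow in free-function form; writesRecorded, numApplied, and tableEmpty are hypothetical parameters standing in for _sideWritesCounter, _numApplied, and the cursor probe on the side-writes table.

// Hypothetical free-function restatement of the member logic above.
bool areAllWritesApplied(long writesRecorded, long numApplied, bool tableEmpty) {
    if (!tableEmpty)
        return false;  // ordinary unapplied side writes remain
    if (writesRecorded != numApplied) {
        // Old behavior: dassert(writesRecorded == numApplied) + warning().
        // New behavior: this is expected while a prepared transaction's
        // side-table inserts are still invisible, so report "not done" and
        // let the coordinator drain again.
        return false;
    }
    return true;
}
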
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index f42a3609753..64cb431e4f4 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -893,32 +893,46 @@ void IndexBuildsCoordinator::_buildIndex(OperationContext* opCtx,
MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangAfterIndexBuildSecondDrain);
}
- // Need to return the collection lock back to exclusive mode, to complete the index build.
- opCtx->recoveryUnit()->abandonSnapshot();
- collLock->emplace(opCtx, nss, MODE_X);
-
- // We hold the database MODE_IX lock throughout the index build.
- auto db = DatabaseHolder::get(opCtx)->getDb(opCtx, nss.db());
- if (db) {
- auto& dss = DatabaseShardingState::get(db);
- auto dssLock = DatabaseShardingState::DSSLock::lockShared(opCtx, &dss);
- dss.checkDbVersion(opCtx, dssLock);
- }
-
- invariant(db,
- str::stream() << "Database not found after relocking. Index build: "
- << replState->buildUUID << ": " << nss << " ("
- << replState->collectionUUID << ")");
+ // Keep draining until we see everything.
+ while (true) {
+ // Need to return the collection lock back to exclusive mode, to complete the index build.
+ opCtx->recoveryUnit()->abandonSnapshot();
+ collLock->emplace(opCtx, nss, MODE_X);
+
+ // We hold the database MODE_IX lock throughout the index build.
+ auto db = DatabaseHolder::get(opCtx)->getDb(opCtx, nss.db());
+ if (db) {
+ auto& dss = DatabaseShardingState::get(db);
+ auto dssLock = DatabaseShardingState::DSSLock::lockShared(opCtx, &dss);
+ dss.checkDbVersion(opCtx, dssLock);
+ }
- invariant(db->getCollection(opCtx, nss),
- str::stream() << "Collection not found after relocking. Index build: "
- << replState->buildUUID << ": " << nss << " ("
- << replState->collectionUUID << ")");
+ invariant(db,
+ str::stream() << "Database not found after relocking. Index build: "
+ << replState->buildUUID << ": " << nss << " ("
+ << replState->collectionUUID << ")");
+
+ invariant(db->getCollection(opCtx, nss),
+ str::stream() << "Collection not found after relocking. Index build: "
+ << replState->buildUUID << ": " << nss << " ("
+ << replState->collectionUUID << ")");
+
+ // Perform the third and final drain after releasing a shared lock and reacquiring an
+ // exclusive lock on the database.
+ uassertStatusOK(_indexBuildsManager.drainBackgroundWrites(
+ opCtx, replState->buildUUID, RecoveryUnit::ReadSource::kUnset));
+
+ // Check that we saw everything in the side writes table that we needed to drain. If not,
+ // there must still be an outstanding prepared transaction.
+ if (_indexBuildsManager.areAllWritesApplied(opCtx, replState->buildUUID)) {
+ break;
+ }
- // Perform the third and final drain after releasing a shared lock and reacquiring an
- // exclusive lock on the database.
- uassertStatusOK(_indexBuildsManager.drainBackgroundWrites(
- opCtx, replState->buildUUID, RecoveryUnit::ReadSource::kUnset));
+ // Allow some time for the prepared transaction(s) to commit.
+ collLock->reset();
+ opCtx->sleepFor(Seconds(1));
+ // Try draining again.
+ }
// Index constraint checking phase.
uassertStatusOK(
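
Note the ordering inside the retry loop above: collLock->reset() releases the MODE_X collection lock before the one-second sleep, because committing a prepared transaction needs to take locks on that collection, and sleeping while still holding the exclusive lock would stall the loop against the very commit it is waiting for. Below is a minimal sketch of that lock-cycling pattern, with a plain std::mutex and a counter standing in for the real Lock::CollectionLock, drainBackgroundWrites, and areAllWritesApplied.

#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex collectionLock;    // stand-in for the MODE_X collection lock
int gAttempts = 0;

bool drainAndCheck() {        // stand-in for drain + areAllWritesApplied
    return ++gAttempts >= 3;  // pretend the third drain finally sees everything
}

// The caller owns the guard (the real code holds the lock in a caller-owned
// optional), so the loop can drop and re-take the exclusive lock across retries.
void retryDrainLoop(std::unique_lock<std::mutex>& collLock) {
    while (true) {
        collLock.lock();      // re-acquire the exclusive collection lock
        if (drainAndCheck())
            return;           // done; the caller proceeds still holding the lock
        collLock.unlock();    // collLock->reset(): let prepared txns acquire locks
        // The real code sleeps for one second via opCtx->sleepFor(Seconds(1)).
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
}

int main() {
    std::unique_lock<std::mutex> lk(collectionLock, std::defer_lock);
    retryDrainLoop(lk);
    std::cout << "drained after " << gAttempts << " attempts\n";
}   // lk releases the collection lock on destruction
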