diff options
author | Eric Milkie <milkie@10gen.com> | 2020-04-06 14:45:23 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-08 20:39:40 +0000 |
commit | 56b3fda768b6009c78310893d88e929025137291 (patch) | |
tree | 43de73be5e36b6def5a40aa1999e4a51cb1a84b8 | |
parent | 72e265eccbb6e49497eb386a0829ed75bae8c032 (diff) | |
download | mongo-56b3fda768b6009c78310893d88e929025137291.tar.gz |
SERVER-44507 retry hybrid index build drain mode until all prepared transactions on the collection have committed
-rw-r--r-- | jstests/replsets/index_build_stepdown_stepup_stepdown.js | 127 | ||||
-rw-r--r-- | src/mongo/db/catalog/index_builds_manager.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/catalog/index_builds_manager.h | 6 | ||||
-rw-r--r-- | src/mongo/db/catalog/multi_index_block.cpp | 14 | ||||
-rw-r--r-- | src/mongo/db/catalog/multi_index_block.h | 5 | ||||
-rw-r--r-- | src/mongo/db/index/index_build_interceptor.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/index_builds_coordinator.cpp | 62 |
7 files changed, 197 insertions(+), 28 deletions(-)
diff --git a/jstests/replsets/index_build_stepdown_stepup_stepdown.js b/jstests/replsets/index_build_stepdown_stepup_stepdown.js new file mode 100644 index 00000000000..30e51806196 --- /dev/null +++ b/jstests/replsets/index_build_stepdown_stepup_stepdown.js @@ -0,0 +1,127 @@ +/* + * This test starts an index build, then through a series of failovers, demonstrates that the index + * build does not miss any writes from outstanding prepared transactions. This is an issue since + * prepared transactions do not use locks on secondaries, and thus can allow an index build to + * commit without waiting for the prepared transaction. + * + * @tags: [ + * requires_document_locking, + * requires_replication, + * uses_prepare_transaction, + * uses_transactions, + * ] + */ +load("jstests/replsets/rslib.js"); +load("jstests/core/txns/libs/prepare_helpers.js"); + +(function() { + +"use strict"; + +const rst = new ReplSetTest({nodes: 2}); +rst.startSet(); +rst.initiate(); + +const dbName = jsTestName(); +const collName = "coll"; + +const primary = rst.getPrimary(); +const primaryDB = primary.getDB(dbName); +const primaryColl = primaryDB[collName]; +const collNss = primaryColl.getFullName(); + +jsTestLog("Do document writes"); +for (var i = 0; i < 4; i++) { + assert.commandWorked(primaryColl.insert({_id: i, y: i}, {"writeConcern": {"w": 1}})); +} +rst.awaitReplication(); + +const secondary = rst.getSecondary(); +const secondaryAdmin = secondary.getDB("admin"); +const secondaryColl = secondary.getDB(dbName)[collName]; + +assert.commandWorked( + secondary.adminCommand({setParameter: 1, internalQueryExecYieldIterations: 2})); + +jsTestLog("Enable setYieldAllLocksHang fail point"); +assert.commandWorked(secondaryAdmin.runCommand( + {configureFailPoint: "setYieldAllLocksHang", data: {namespace: collNss}, mode: "alwaysOn"})); + +TestData.dbName = dbName; +TestData.collName = collName; + +const indexThread = startParallelShell(() => { + jsTestLog("Create index"); + const primaryDB = 
db.getSiblingDB(TestData.dbName); + assert.commandWorked(primaryDB[TestData.collName].createIndex({"y": 1})); +}, primary.port); + +// Wait until index build (collection scan phase) on secondary yields. +jsTestLog("Wait for the hybrid index build on secondary to hang"); +assert.soon( + function() { + const result = secondaryAdmin.currentOp({"command.createIndexes": collName}); + assert.commandWorked(result); + if (result.inprog.length === 1 && result.inprog[0].numYields > 0) { + return true; + } + + return false; + }, + function() { + return "Failed to find operation in currentOp() output: " + + tojson(secondaryAdmin.currentOp()); + }, + 30 * 1000); + +jsTestLog("Step up the current secondary"); +assert.commandWorked(secondary.adminCommand({"replSetStepUp": 1})); +waitForState(secondary, ReplSetTest.State.PRIMARY); +waitForState(primary, ReplSetTest.State.SECONDARY); +const newPrimary = rst.getPrimary(); + +// Make sure the secondary was able to step up successfully. +assert.eq(newPrimary, secondary); + +jsTestLog("Start a txn"); +const session = newPrimary.startSession(); +const sessionDB = session.getDatabase(dbName); +const sessionColl = sessionDB.getCollection(collName); +session.startTransaction(); +assert.commandWorked(sessionColl.insert({_id: 20})); + +jsTestLog("Prepare txn"); +const prepareTimestamp = PrepareHelpers.prepareTransaction(session); + +// Make the hybrid index build hang after first drain +assert.commandWorked(newPrimary.adminCommand( + {configureFailPoint: "hangAfterIndexBuildFirstDrain", mode: "alwaysOn"})); + +// This will make the hybrid build previously started on secondary (now primary) resume. +jsTestLog("Disable setYieldAllLocksHang fail point"); +assert.commandWorked( + newPrimary.adminCommand({configureFailPoint: "setYieldAllLocksHang", mode: "off"})); + +// Wait until the first drain completed. 
+checkLog.contains(newPrimary, "Hanging after index build first drain"); + +assert.commandWorked( + newPrimary.adminCommand({configureFailPoint: "hangAfterIndexBuildFirstDrain", mode: "off"})); + +jsTestLog("Step up the old primary"); +assert.commandWorked(primary.adminCommand({"replSetStepUp": 1})); +waitForState(primary, ReplSetTest.State.PRIMARY); +rst.awaitReplication(); + +// Create a proxy session in order to complete the prepared transaction. +const newSession = new _DelegatingDriverSession(primary, session); + +assert.commandWorked(PrepareHelpers.commitTransaction(newSession, prepareTimestamp)); +indexThread(); +rst.awaitReplication(); + +assert.soon(() => 2 == primaryColl.getIndexes().length); +assert.soon(() => 2 == secondaryColl.getIndexes().length); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/catalog/index_builds_manager.cpp b/src/mongo/db/catalog/index_builds_manager.cpp index 58154ea1042..7ad6d6b1c70 100644 --- a/src/mongo/db/catalog/index_builds_manager.cpp +++ b/src/mongo/db/catalog/index_builds_manager.cpp @@ -219,6 +219,11 @@ Status IndexBuildsManager::drainBackgroundWrites(OperationContext* opCtx, return builder->drainBackgroundWrites(opCtx, readSource); } +bool IndexBuildsManager::areAllWritesApplied(OperationContext* opCtx, const UUID& buildUUID) { + auto builder = _getBuilder(buildUUID); + return builder->areAllWritesApplied(opCtx); +} + Status IndexBuildsManager::finishBuildingPhase(const UUID& buildUUID) { auto multiIndexBlockPtr = _getBuilder(buildUUID); // TODO: verify that the index builder is in the expected state. 
diff --git a/src/mongo/db/catalog/index_builds_manager.h b/src/mongo/db/catalog/index_builds_manager.h index 6c3f14ad04e..d7c5218a92c 100644 --- a/src/mongo/db/catalog/index_builds_manager.h +++ b/src/mongo/db/catalog/index_builds_manager.h @@ -124,6 +124,12 @@ public: RecoveryUnit::ReadSource readSource); /** + * For the given index build, check whether all the side writes have been applied from all the + * side write tables. + */ + bool areAllWritesApplied(OperationContext* opCtx, const UUID& buildUUID); + + /** * Persists information in the index catalog entry to reflect the successful completion of the * scanning/insertion phase. * diff --git a/src/mongo/db/catalog/multi_index_block.cpp b/src/mongo/db/catalog/multi_index_block.cpp index 5426298bcc0..690e8cd83a1 100644 --- a/src/mongo/db/catalog/multi_index_block.cpp +++ b/src/mongo/db/catalog/multi_index_block.cpp @@ -727,6 +727,20 @@ Status MultiIndexBlock::dumpInsertsFromBulk(OperationContext* opCtx, return Status::OK(); } +bool MultiIndexBlock::areAllWritesApplied(OperationContext* opCtx) { + for (size_t i = 0; i < _indexes.size(); i++) { + auto interceptor = _indexes[i].block->getEntry()->indexBuildInterceptor(); + if (!interceptor) + continue; + + if (!interceptor->areAllWritesApplied(opCtx)) { + return false; + } + } + + return true; +} + Status MultiIndexBlock::drainBackgroundWrites(OperationContext* opCtx, RecoveryUnit::ReadSource readSource) { if (State::kAborted == _getState()) { diff --git a/src/mongo/db/catalog/multi_index_block.h b/src/mongo/db/catalog/multi_index_block.h index b126d4cc519..f456162f2e9 100644 --- a/src/mongo/db/catalog/multi_index_block.h +++ b/src/mongo/db/catalog/multi_index_block.h @@ -174,6 +174,11 @@ public: Status dumpInsertsFromBulk(OperationContext* opCtx, std::set<RecordId>* const dupRecords); /** + * Check whether all the side writes have been applied from all the side write tables. 
+ */ + bool areAllWritesApplied(OperationContext* opCtx); + + /** * For background indexes using an IndexBuildInterceptor to capture inserts during a build, * drain these writes into the index. If intent locks are held on the collection, more writes * may come in after this drain completes. To ensure that all writes are completely drained diff --git a/src/mongo/db/index/index_build_interceptor.cpp b/src/mongo/db/index/index_build_interceptor.cpp index a965feb4c60..34e4c203bab 100644 --- a/src/mongo/db/index/index_build_interceptor.cpp +++ b/src/mongo/db/index/index_build_interceptor.cpp @@ -357,7 +357,6 @@ bool IndexBuildInterceptor::areAllWritesApplied(OperationContext* opCtx) const { invariant(_sideWritesTable); auto cursor = _sideWritesTable->rs()->getCursor(opCtx); auto record = cursor->next(); - // The table is empty only when all writes are applied. if (!record) { auto writesRecorded = _sideWritesCounter->load(); @@ -366,9 +365,8 @@ bool IndexBuildInterceptor::areAllWritesApplied(OperationContext* opCtx) const { << "The number of side writes recorded does not match the number " "applied, despite the table appearing empty. Writes recorded: " << writesRecorded << ", applied: " << _numApplied; - - dassert(writesRecorded == _numApplied, message); - warning() << message; + log() << message; + return false; } return true; } diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index f42a3609753..64cb431e4f4 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -893,32 +893,46 @@ void IndexBuildsCoordinator::_buildIndex(OperationContext* opCtx, MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangAfterIndexBuildSecondDrain); } - // Need to return the collection lock back to exclusive mode, to complete the index build. - opCtx->recoveryUnit()->abandonSnapshot(); - collLock->emplace(opCtx, nss, MODE_X); - - // We hold the database MODE_IX lock throughout the index build. 
- auto db = DatabaseHolder::get(opCtx)->getDb(opCtx, nss.db()); - if (db) { - auto& dss = DatabaseShardingState::get(db); - auto dssLock = DatabaseShardingState::DSSLock::lockShared(opCtx, &dss); - dss.checkDbVersion(opCtx, dssLock); - } - - invariant(db, - str::stream() << "Database not found after relocking. Index build: " - << replState->buildUUID << ": " << nss << " (" - << replState->collectionUUID << ")"); + // Keep draining until we see everything. + while (true) { + // Need to return the collection lock back to exclusive mode, to complete the index build. + opCtx->recoveryUnit()->abandonSnapshot(); + collLock->emplace(opCtx, nss, MODE_X); + + // We hold the database MODE_IX lock throughout the index build. + auto db = DatabaseHolder::get(opCtx)->getDb(opCtx, nss.db()); + if (db) { + auto& dss = DatabaseShardingState::get(db); + auto dssLock = DatabaseShardingState::DSSLock::lockShared(opCtx, &dss); + dss.checkDbVersion(opCtx, dssLock); + } - invariant(db->getCollection(opCtx, nss), - str::stream() << "Collection not found after relocking. Index build: " - << replState->buildUUID << ": " << nss << " (" - << replState->collectionUUID << ")"); + invariant(db, + str::stream() << "Database not found after relocking. Index build: " + << replState->buildUUID << ": " << nss << " (" + << replState->collectionUUID << ")"); + + invariant(db->getCollection(opCtx, nss), + str::stream() << "Collection not found after relocking. Index build: " + << replState->buildUUID << ": " << nss << " (" + << replState->collectionUUID << ")"); + + // Perform the third and final drain after releasing a shared lock and reacquiring an + // exclusive lock on the database. + uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( + opCtx, replState->buildUUID, RecoveryUnit::ReadSource::kUnset)); + + // Check that we saw everything in the side writes table that we needed to drain. If not, + // there must still be an outstanding prepared transaction. 
+ if (_indexBuildsManager.areAllWritesApplied(opCtx, replState->buildUUID)) { + break; + } - // Perform the third and final drain after releasing a shared lock and reacquiring an - // exclusive lock on the database. - uassertStatusOK(_indexBuildsManager.drainBackgroundWrites( - opCtx, replState->buildUUID, RecoveryUnit::ReadSource::kUnset)); + // Allow some time for the prepared transaction(s) to commit. + collLock->reset(); + opCtx->sleepFor(Seconds(1)); + // Try draining again. + } // Index constraint checking phase. uassertStatusOK( |