diff options
author | Huayu Ouyang <huayu.ouyang@mongodb.com> | 2021-04-01 17:35:40 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-04-07 14:43:34 +0000 |
commit | 3de4ca147e62d806bfc73852a8e1cd1fa65359d0 (patch) | |
tree | 217800df6a189401c27b38ee3e5f8be536e8ccb9 | |
parent | 6b7f536f55941a738b76b675d6aa958d39c4a5cd (diff) | |
download | mongo-3de4ca147e62d806bfc73852a8e1cd1fa65359d0.tar.gz |
SERVER-55008 Only abort two-phase index builds when BackgroundOperationInProg error in initial sync
-rw-r--r-- | jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js | 3 | ||||
-rw-r--r-- | jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js | 101 | ||||
-rw-r--r-- | src/mongo/db/index_builds_coordinator.cpp | 57 | ||||
-rw-r--r-- | src/mongo/db/index_builds_coordinator.h | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog.cpp | 44 |
5 files changed, 179 insertions, 32 deletions
diff --git a/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js b/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js index 77c2f37ba47..000f72410a9 100644 --- a/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js +++ b/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js @@ -79,7 +79,8 @@ rst.awaitReplication(); rst.awaitSecondaryNodes(); // Check the that secondary hit the background operation in progress error. -checkLog.containsJson(secondary, 23879, {reason: "Aborting index builds during initial sync"}); +checkLog.containsJson( + secondary, 5500800, {reason: "Aborting two phase index builds during initial sync"}); IndexBuildTest.resumeIndexBuilds(secondary); diff --git a/jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js b/jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js new file mode 100644 index 00000000000..01a37b6168d --- /dev/null +++ b/jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js @@ -0,0 +1,101 @@ +/** + * Tests that if an initial syncing node runs an index build and a conflicting DDL operation + * at the same time during the oplog application phase, it will not abort single phase index builds. + * + * @tags: [requires_fcv_44] + */ +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/replsets/rslib.js"); +load("jstests/noPassthrough/libs/index_build.js"); + +function runTest(runDbCommand) { + const name = jsTestName(); + // Disable two phase index builds so the replica set only runs single phase index builds. + const rst = new ReplSetTest({ + name, + nodes: 1, + nodeOptions: {setParameter: "enableTwoPhaseIndexBuild=0"}, + }); + rst.startSet(); + rst.initiateWithHighElectionTimeout(); + + const primary = rst.getPrimary(); + const dbName = jsTestName(); + const collName = "coll"; + const primaryDb = primary.getDB(dbName); + const primaryColl = primaryDb.getCollection(collName); + const collFullName = primaryColl.getFullName(); + + // Insert initial data to ensure that the repl set is initialized correctly. + assert.commandWorked(primaryColl.insert({a: 1})); + rst.awaitReplication(); + + jsTestLog("Adding a new node to the replica set"); + const initialSyncNode = rst.add({}); + + const hangAfterDataCloning = + configureFailPoint(initialSyncNode, "initialSyncHangAfterDataCloning"); + const indexHang = configureFailPoint(initialSyncNode, "hangAfterStartingIndexBuild"); + + jsTestLog("Waiting for initial sync node to reach initial sync state"); + rst.reInitiate(); + rst.waitForState(initialSyncNode, ReplSetTest.State.STARTUP_2); + + // Hang the initial sync node after the data cloning phase to ensure that the node will + // apply the index build oplog entry as part of the oplog application phase of initial sync. + hangAfterDataCloning.wait(); + + jsTestLog("Creating index build and running DDL operation on primary"); + primaryColl.createIndex({a: 1}); + + let awaitDropDatabase; + if (runDbCommand) { + TestData.dbName = dbName; + awaitDropDatabase = startParallelShell(() => { + assert.commandWorked(db.getSiblingDB(TestData.dbName).dropDatabase()); + }, primary.port); + checkLog.containsJson(primary, 20337); + } else { + assert.commandWorked(primaryColl.renameCollection('coll2')); + } + + jsTestLog("Continuing with initial sync and hanging on index build"); + hangAfterDataCloning.off(); + + // Hang the index build after starting so that a DDL operation + // will conflict with the index build, which will attempt to abort two phase index builds. + indexHang.wait(); + + // Both the dropDatabase command and the renameCollection command will try to abort + // collection index builds because when the primary receives a dropDatabase command, it first + // drops collections, and on the secondary a dropCollection command is actually run before + // the dropDatabase command. + checkLog.containsJson( + initialSyncNode, + 5500800, + {reason: "Aborting two phase index builds during initial sync", namespace: collFullName}); + + jsTestLog("Wait for the initial sync to succeed"); + indexHang.off(); + waitForState(initialSyncNode, ReplSetTest.State.SECONDARY); + + jsTestLog("Check that the single phase index build should have successfully been built"); + if (runDbCommand) { + // Check that the log contains line indicating that the index build has successfully + // completed. + checkLog.containsJson(initialSyncNode, 20663, {"namespace": collFullName}); + awaitDropDatabase(); + } else { + IndexBuildTest.assertIndexes( + initialSyncNode.getDB(dbName).getCollection("coll2"), 2, ["_id_", "a_1"]); + } + + rst.stopSet(); +} + +runTest(false); +runTest(true); +})(); diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp index 1e2da159ce3..312be2202c5 100644 --- a/src/mongo/db/index_builds_coordinator.cpp +++ b/src/mongo/db/index_builds_coordinator.cpp @@ -570,17 +570,32 @@ std::vector<UUID> IndexBuildsCoordinator::abortCollectionIndexBuilds( OperationContext* opCtx, const NamespaceString collectionNss, const UUID collectionUUID, - const std::string& reason) { - LOGV2(23879, - "About to abort all index builders", - "namespace"_attr = collectionNss, - "uuid"_attr = collectionUUID, - "reason"_attr = reason); + const std::string& reason, + const bool onlyAbortTwoPhaseIndexBuilds) { + if (onlyAbortTwoPhaseIndexBuilds) { + LOGV2(5500800, + "About to abort all two phase index builders", + "namespace"_attr = collectionNss, + "uuid"_attr = collectionUUID, + "reason"_attr = reason); + } else { + LOGV2(23879, + "About to abort all index builders", + "namespace"_attr = collectionNss, + "uuid"_attr = collectionUUID, + "reason"_attr = reason); + } + auto collIndexBuilds = [&]() -> std::vector<std::shared_ptr<ReplIndexBuildState>> { stdx::unique_lock<Latch> lk(_mutex); auto indexBuildFilter = [=](const auto& replState) { - return collectionUUID == replState.collectionUUID; + if (onlyAbortTwoPhaseIndexBuilds) { + return collectionUUID == replState.collectionUUID && + IndexBuildProtocol::kTwoPhase == replState.protocol; + } else { + return collectionUUID == replState.collectionUUID; + } }; return _filterIndexBuilds_inlock(lk, indexBuildFilter); }(); @@ -608,15 +623,31 @@ void IndexBuildsCoordinator::_awaitNoBgOpInProgForDb(stdx::unique_lock<Latch>& l void IndexBuildsCoordinator::abortDatabaseIndexBuilds(OperationContext* opCtx, StringData db, - const std::string& reason) { - LOGV2(4612302, - "About to abort all index builders running for collections in the given database", - "database"_attr = db, - "reason"_attr = reason); + const std::string& reason, + const bool onlyAbortTwoPhaseIndexBuilds) { + if (onlyAbortTwoPhaseIndexBuilds) { + LOGV2(5500801, + "About to abort all two phase index builders running for collections in the given " + "database", + "database"_attr = db, + "reason"_attr = reason); + } else { + LOGV2(4612302, + "About to abort all index builders running for collections in the given database", + "database"_attr = db, + "reason"_attr = reason); + } auto builds = [&]() -> std::vector<std::shared_ptr<ReplIndexBuildState>> { stdx::unique_lock<Latch> lk(_mutex); - auto indexBuildFilter = [=](const auto& replState) { return db == replState.dbName; }; + auto indexBuildFilter = [=](const auto& replState) { + if (onlyAbortTwoPhaseIndexBuilds) { + return db == replState.dbName && + IndexBuildProtocol::kTwoPhase == replState.protocol; + } else { + return db == replState.dbName; + } + }; return _filterIndexBuilds_inlock(lk, indexBuildFilter); }(); for (auto replState : builds) { diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h index e6c2e01388a..958ceb147cd 100644 --- a/src/mongo/db/index_builds_coordinator.h +++ b/src/mongo/db/index_builds_coordinator.h @@ -222,7 +222,8 @@ public: std::vector<UUID> abortCollectionIndexBuilds(OperationContext* opCx, const NamespaceString collectionNss, const UUID collectionUUID, - const std::string& reason); + const std::string& reason, + const bool onlyAbortTwoPhaseIndexBuilds = false); /** * Signals all of the index builds on the specified 'db' to abort and then waits until the index @@ -235,7 +236,8 @@ public: */ void abortDatabaseIndexBuilds(OperationContext* opCtx, StringData db, - const std::string& reason); + const std::string& reason, + const bool onlyAbortTwoPhaseIndexBuilds = false); /** * Signals all of the index builds to abort and then waits until the index builds are no longer diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 5a2fa47cca4..1ea309f24e0 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -158,13 +158,14 @@ bool shouldBuildInForeground(OperationContext* opCtx, return false; } -void abortIndexBuilds(OperationContext* opCtx, - const OplogEntry::CommandType& commandType, - const NamespaceString& nss, - const std::string& reason) { +void abortTwoPhaseIndexBuilds(OperationContext* opCtx, + const OplogEntry::CommandType& commandType, + const NamespaceString& nss, + const std::string& reason) { auto indexBuildsCoordinator = IndexBuildsCoordinator::get(opCtx); if (commandType == OplogEntry::CommandType::kDropDatabase) { - indexBuildsCoordinator->abortDatabaseIndexBuilds(opCtx, nss.db(), reason); + indexBuildsCoordinator->abortDatabaseIndexBuilds( + opCtx, nss.db(), reason, true /* onlyAbortTwoPhaseIndexBuilds */); } else if (commandType == OplogEntry::CommandType::kDrop || commandType == OplogEntry::CommandType::kDropIndexes || commandType == OplogEntry::CommandType::kRenameCollection) { @@ -172,7 +173,8 @@ void abortIndexBuilds(OperationContext* opCtx, CollectionCatalog::get(opCtx).lookupUUIDByNSS(opCtx, nss); invariant(collUUID); - indexBuildsCoordinator->abortCollectionIndexBuilds(opCtx, nss, *collUUID, reason); + indexBuildsCoordinator->abortCollectionIndexBuilds( + opCtx, nss, *collUUID, reason, true /* onlyAbortTwoPhaseIndexBuilds */); } } @@ -1522,14 +1524,19 @@ Status applyCommand_inlock(OperationContext* opCtx, } case ErrorCodes::BackgroundOperationInProgressForDatabase: { if (mode == OplogApplication::Mode::kInitialSync) { - abortIndexBuilds(opCtx, - entry.getCommandType(), - nss, - "Aborting index builds during initial sync"); + // We only want to abort two phase index builds on an initial syncing node + // because single phase index builds don't communicate back to the primary + // and they may have already been completed on the primary. + // For single phase index builds, we instead wait for the index build to + // complete before applying the conflicting DDL operation. + abortTwoPhaseIndexBuilds(opCtx, + entry.getCommandType(), + nss, + "Aborting two phase index builds during initial sync"); LOGV2_DEBUG(4665900, 1, "Conflicting DDL operation encountered during initial sync; " - "aborting index build and retrying", + "aborting two phase index builds and retrying", "db"_attr = nss.db()); } @@ -1557,14 +1564,19 @@ Status applyCommand_inlock(OperationContext* opCtx, auto ns = cmd->parse(opCtx, OpMsgRequest::fromDBAndBody(nss.db(), o))->ns(); if (mode == OplogApplication::Mode::kInitialSync) { - abortIndexBuilds(opCtx, - entry.getCommandType(), - ns, - "Aborting index builds during initial sync"); + // We only want to abort two phase index builds on an initial syncing node + // because single phase index builds don't communicate back to the primary + // and they may have already been completed on the primary. + // For single phase index builds, we instead wait for the index build to + // complete before applying the conflicting DDL operation. + abortTwoPhaseIndexBuilds(opCtx, + entry.getCommandType(), + ns, + "Aborting two phase index builds during initial sync"); LOGV2_DEBUG(4665901, 1, "Conflicting DDL operation encountered during initial sync; " - "aborting index build and retrying", + "aborting two phase index build and retrying", "namespace"_attr = ns); } |