summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHuayu Ouyang <huayu.ouyang@mongodb.com>2021-04-01 17:35:40 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-04-07 14:43:34 +0000
commit3de4ca147e62d806bfc73852a8e1cd1fa65359d0 (patch)
tree217800df6a189401c27b38ee3e5f8be536e8ccb9
parent6b7f536f55941a738b76b675d6aa958d39c4a5cd (diff)
downloadmongo-3de4ca147e62d806bfc73852a8e1cd1fa65359d0.tar.gz
SERVER-55008 Only abort two-phase index builds when BackgroundOperationInProg error in initial sync
-rw-r--r--jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js3
-rw-r--r--jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js101
-rw-r--r--src/mongo/db/index_builds_coordinator.cpp57
-rw-r--r--src/mongo/db/index_builds_coordinator.h6
-rw-r--r--src/mongo/db/repl/oplog.cpp44
5 files changed, 179 insertions, 32 deletions
diff --git a/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js b/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js
index 77c2f37ba47..000f72410a9 100644
--- a/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js
+++ b/jstests/noPassthrough/initial_sync_aborts_two_phase_index_builds.js
@@ -79,7 +79,8 @@ rst.awaitReplication();
rst.awaitSecondaryNodes();
// Check the that secondary hit the background operation in progress error.
-checkLog.containsJson(secondary, 23879, {reason: "Aborting index builds during initial sync"});
+checkLog.containsJson(
+ secondary, 5500800, {reason: "Aborting two phase index builds during initial sync"});
IndexBuildTest.resumeIndexBuilds(secondary);
diff --git a/jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js b/jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js
new file mode 100644
index 00000000000..01a37b6168d
--- /dev/null
+++ b/jstests/replsets/initial_sync_avoids_aborting_single_phase_index_builds.js
@@ -0,0 +1,101 @@
+/**
+ * Tests that if an initial syncing node runs an index build and a conflicting DDL operation
+ * at the same time during the oplog application phase, it will not abort single phase index builds.
+ *
+ * @tags: [requires_fcv_44]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");
+load("jstests/noPassthrough/libs/index_build.js");
+
+function runTest(runDbCommand) {
+ const name = jsTestName();
+ // Disable two phase index builds so the replica set only runs single phase index builds.
+ const rst = new ReplSetTest({
+ name,
+ nodes: 1,
+ nodeOptions: {setParameter: "enableTwoPhaseIndexBuild=0"},
+ });
+ rst.startSet();
+ rst.initiateWithHighElectionTimeout();
+
+ const primary = rst.getPrimary();
+ const dbName = jsTestName();
+ const collName = "coll";
+ const primaryDb = primary.getDB(dbName);
+ const primaryColl = primaryDb.getCollection(collName);
+ const collFullName = primaryColl.getFullName();
+
+ // Insert initial data to ensure that the repl set is initialized correctly.
+ assert.commandWorked(primaryColl.insert({a: 1}));
+ rst.awaitReplication();
+
+ jsTestLog("Adding a new node to the replica set");
+ const initialSyncNode = rst.add({});
+
+ const hangAfterDataCloning =
+ configureFailPoint(initialSyncNode, "initialSyncHangAfterDataCloning");
+ const indexHang = configureFailPoint(initialSyncNode, "hangAfterStartingIndexBuild");
+
+ jsTestLog("Waiting for initial sync node to reach initial sync state");
+ rst.reInitiate();
+ rst.waitForState(initialSyncNode, ReplSetTest.State.STARTUP_2);
+
+ // Hang the initial sync node after the data cloning phase to ensure that the node will
+ // apply the index build oplog entry as part of the oplog application phase of initial sync.
+ hangAfterDataCloning.wait();
+
+ jsTestLog("Creating index build and running DDL operation on primary");
+ primaryColl.createIndex({a: 1});
+
+ let awaitDropDatabase;
+ if (runDbCommand) {
+ TestData.dbName = dbName;
+ awaitDropDatabase = startParallelShell(() => {
+ assert.commandWorked(db.getSiblingDB(TestData.dbName).dropDatabase());
+ }, primary.port);
+ checkLog.containsJson(primary, 20337);
+ } else {
+ assert.commandWorked(primaryColl.renameCollection('coll2'));
+ }
+
+ jsTestLog("Continuing with initial sync and hanging on index build");
+ hangAfterDataCloning.off();
+
+ // Hang the index build after starting so that a DDL operation
+ // will conflict with the index build, which will attempt to abort two phase index builds.
+ indexHang.wait();
+
+ // Both the dropDatabase command and the renameCollection command will try to abort
+ // collection index builds because when the primary receives a dropDatabase command, it first
+ // drops collections, and on the secondary a dropCollection command is actually run before
+ // the dropDatabase command.
+ checkLog.containsJson(
+ initialSyncNode,
+ 5500800,
+ {reason: "Aborting two phase index builds during initial sync", namespace: collFullName});
+
+ jsTestLog("Wait for the initial sync to succeed");
+ indexHang.off();
+ waitForState(initialSyncNode, ReplSetTest.State.SECONDARY);
+
+ jsTestLog("Check that the single phase index build should have successfully been built");
+ if (runDbCommand) {
+ // Check that the log contains line indicating that the index build has successfully
+ // completed.
+ checkLog.containsJson(initialSyncNode, 20663, {"namespace": collFullName});
+ awaitDropDatabase();
+ } else {
+ IndexBuildTest.assertIndexes(
+ initialSyncNode.getDB(dbName).getCollection("coll2"), 2, ["_id_", "a_1"]);
+ }
+
+ rst.stopSet();
+}
+
+runTest(false);
+runTest(true);
+})();
diff --git a/src/mongo/db/index_builds_coordinator.cpp b/src/mongo/db/index_builds_coordinator.cpp
index 1e2da159ce3..312be2202c5 100644
--- a/src/mongo/db/index_builds_coordinator.cpp
+++ b/src/mongo/db/index_builds_coordinator.cpp
@@ -570,17 +570,32 @@ std::vector<UUID> IndexBuildsCoordinator::abortCollectionIndexBuilds(
OperationContext* opCtx,
const NamespaceString collectionNss,
const UUID collectionUUID,
- const std::string& reason) {
- LOGV2(23879,
- "About to abort all index builders",
- "namespace"_attr = collectionNss,
- "uuid"_attr = collectionUUID,
- "reason"_attr = reason);
+ const std::string& reason,
+ const bool onlyAbortTwoPhaseIndexBuilds) {
+ if (onlyAbortTwoPhaseIndexBuilds) {
+ LOGV2(5500800,
+ "About to abort all two phase index builders",
+ "namespace"_attr = collectionNss,
+ "uuid"_attr = collectionUUID,
+ "reason"_attr = reason);
+ } else {
+ LOGV2(23879,
+ "About to abort all index builders",
+ "namespace"_attr = collectionNss,
+ "uuid"_attr = collectionUUID,
+ "reason"_attr = reason);
+ }
+
auto collIndexBuilds = [&]() -> std::vector<std::shared_ptr<ReplIndexBuildState>> {
stdx::unique_lock<Latch> lk(_mutex);
auto indexBuildFilter = [=](const auto& replState) {
- return collectionUUID == replState.collectionUUID;
+ if (onlyAbortTwoPhaseIndexBuilds) {
+ return collectionUUID == replState.collectionUUID &&
+ IndexBuildProtocol::kTwoPhase == replState.protocol;
+ } else {
+ return collectionUUID == replState.collectionUUID;
+ }
};
return _filterIndexBuilds_inlock(lk, indexBuildFilter);
}();
@@ -608,15 +623,31 @@ void IndexBuildsCoordinator::_awaitNoBgOpInProgForDb(stdx::unique_lock<Latch>& l
void IndexBuildsCoordinator::abortDatabaseIndexBuilds(OperationContext* opCtx,
StringData db,
- const std::string& reason) {
- LOGV2(4612302,
- "About to abort all index builders running for collections in the given database",
- "database"_attr = db,
- "reason"_attr = reason);
+ const std::string& reason,
+ const bool onlyAbortTwoPhaseIndexBuilds) {
+ if (onlyAbortTwoPhaseIndexBuilds) {
+ LOGV2(5500801,
+ "About to abort all two phase index builders running for collections in the given "
+ "database",
+ "database"_attr = db,
+ "reason"_attr = reason);
+ } else {
+ LOGV2(4612302,
+ "About to abort all index builders running for collections in the given database",
+ "database"_attr = db,
+ "reason"_attr = reason);
+ }
auto builds = [&]() -> std::vector<std::shared_ptr<ReplIndexBuildState>> {
stdx::unique_lock<Latch> lk(_mutex);
- auto indexBuildFilter = [=](const auto& replState) { return db == replState.dbName; };
+ auto indexBuildFilter = [=](const auto& replState) {
+ if (onlyAbortTwoPhaseIndexBuilds) {
+ return db == replState.dbName &&
+ IndexBuildProtocol::kTwoPhase == replState.protocol;
+ } else {
+ return db == replState.dbName;
+ }
+ };
return _filterIndexBuilds_inlock(lk, indexBuildFilter);
}();
for (auto replState : builds) {
diff --git a/src/mongo/db/index_builds_coordinator.h b/src/mongo/db/index_builds_coordinator.h
index e6c2e01388a..958ceb147cd 100644
--- a/src/mongo/db/index_builds_coordinator.h
+++ b/src/mongo/db/index_builds_coordinator.h
@@ -222,7 +222,8 @@ public:
std::vector<UUID> abortCollectionIndexBuilds(OperationContext* opCx,
const NamespaceString collectionNss,
const UUID collectionUUID,
- const std::string& reason);
+ const std::string& reason,
+ const bool onlyAbortTwoPhaseIndexBuilds = false);
/**
* Signals all of the index builds on the specified 'db' to abort and then waits until the index
@@ -235,7 +236,8 @@ public:
*/
void abortDatabaseIndexBuilds(OperationContext* opCtx,
StringData db,
- const std::string& reason);
+ const std::string& reason,
+ const bool onlyAbortTwoPhaseIndexBuilds = false);
/**
* Signals all of the index builds to abort and then waits until the index builds are no longer
diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp
index 5a2fa47cca4..1ea309f24e0 100644
--- a/src/mongo/db/repl/oplog.cpp
+++ b/src/mongo/db/repl/oplog.cpp
@@ -158,13 +158,14 @@ bool shouldBuildInForeground(OperationContext* opCtx,
return false;
}
-void abortIndexBuilds(OperationContext* opCtx,
- const OplogEntry::CommandType& commandType,
- const NamespaceString& nss,
- const std::string& reason) {
+void abortTwoPhaseIndexBuilds(OperationContext* opCtx,
+ const OplogEntry::CommandType& commandType,
+ const NamespaceString& nss,
+ const std::string& reason) {
auto indexBuildsCoordinator = IndexBuildsCoordinator::get(opCtx);
if (commandType == OplogEntry::CommandType::kDropDatabase) {
- indexBuildsCoordinator->abortDatabaseIndexBuilds(opCtx, nss.db(), reason);
+ indexBuildsCoordinator->abortDatabaseIndexBuilds(
+ opCtx, nss.db(), reason, true /* onlyAbortTwoPhaseIndexBuilds */);
} else if (commandType == OplogEntry::CommandType::kDrop ||
commandType == OplogEntry::CommandType::kDropIndexes ||
commandType == OplogEntry::CommandType::kRenameCollection) {
@@ -172,7 +173,8 @@ void abortIndexBuilds(OperationContext* opCtx,
CollectionCatalog::get(opCtx).lookupUUIDByNSS(opCtx, nss);
invariant(collUUID);
- indexBuildsCoordinator->abortCollectionIndexBuilds(opCtx, nss, *collUUID, reason);
+ indexBuildsCoordinator->abortCollectionIndexBuilds(
+ opCtx, nss, *collUUID, reason, true /* onlyAbortTwoPhaseIndexBuilds */);
}
}
@@ -1522,14 +1524,19 @@ Status applyCommand_inlock(OperationContext* opCtx,
}
case ErrorCodes::BackgroundOperationInProgressForDatabase: {
if (mode == OplogApplication::Mode::kInitialSync) {
- abortIndexBuilds(opCtx,
- entry.getCommandType(),
- nss,
- "Aborting index builds during initial sync");
+ // We only want to abort two phase index builds on an initial syncing node
+ // because single phase index builds don't communicate back to the primary
+ // and they may have already been completed on the primary.
+ // For single phase index builds, we instead wait for the index build to
+ // complete before applying the conflicting DDL operation.
+ abortTwoPhaseIndexBuilds(opCtx,
+ entry.getCommandType(),
+ nss,
+ "Aborting two phase index builds during initial sync");
LOGV2_DEBUG(4665900,
1,
"Conflicting DDL operation encountered during initial sync; "
- "aborting index build and retrying",
+ "aborting two phase index builds and retrying",
"db"_attr = nss.db());
}
@@ -1557,14 +1564,19 @@ Status applyCommand_inlock(OperationContext* opCtx,
auto ns = cmd->parse(opCtx, OpMsgRequest::fromDBAndBody(nss.db(), o))->ns();
if (mode == OplogApplication::Mode::kInitialSync) {
- abortIndexBuilds(opCtx,
- entry.getCommandType(),
- ns,
- "Aborting index builds during initial sync");
+ // We only want to abort two phase index builds on an initial syncing node
+ // because single phase index builds don't communicate back to the primary
+ // and they may have already been completed on the primary.
+ // For single phase index builds, we instead wait for the index build to
+ // complete before applying the conflicting DDL operation.
+ abortTwoPhaseIndexBuilds(opCtx,
+ entry.getCommandType(),
+ ns,
+ "Aborting two phase index builds during initial sync");
LOGV2_DEBUG(4665901,
1,
"Conflicting DDL operation encountered during initial sync; "
- "aborting index build and retrying",
+ "aborting two phase index build and retrying",
"namespace"_attr = ns);
}