From a22f3c599e1e588cdb5916785f27af7cd5493d18 Mon Sep 17 00:00:00 2001 From: Tommaso Tocci Date: Fri, 4 Feb 2022 09:04:29 +0000 Subject: SERVER-63207 Speedup very slow move_primary_with_writes.js test --- jstests/sharding/move_primary_with_writes.js | 35 ++++++++++++++++++++++------ src/mongo/db/s/sharding_ddl_coordinator.cpp | 20 ++++++++++++---- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/jstests/sharding/move_primary_with_writes.js b/jstests/sharding/move_primary_with_writes.js index b31647b5789..09783f8dc1d 100644 --- a/jstests/sharding/move_primary_with_writes.js +++ b/jstests/sharding/move_primary_with_writes.js @@ -4,7 +4,7 @@ (function() { 'use strict'; -load('jstests/sharding/libs/sharded_transactions_helpers.js'); +load('jstests/libs/fail_point_util.js'); let st = new ShardingTest({ mongos: 2, @@ -177,15 +177,18 @@ function buildDDLCommands(collName) { } function testMovePrimary(failpoint, fromShard, toShard, db, shouldFail, sharded) { + jsTestLog("Testing move primary with FP: " + failpoint + " shouldFail: " + shouldFail + + " sharded: " + sharded); + let codeToRunInParallelShell = '{ db.getSiblingDB("admin").runCommand({movePrimary: "' + dbName + '", to: "' + toShard.name + '"}); }'; - assert.commandWorked(fromShard.adminCommand({configureFailPoint: failpoint, mode: 'alwaysOn'})); + let fp = configureFailPoint(fromShard, failpoint); let awaitShell = startParallelShell(codeToRunInParallelShell, st.s.port); jsTestLog("Waiting for failpoint " + failpoint); - waitForFailpoint("Hit " + failpoint, 1); + fp.wait(); clearRawMongoProgramOutput(); // Test DML @@ -216,21 +219,24 @@ function testMovePrimary(failpoint, fromShard, toShard, db, shouldFail, sharded) } }); - assert.commandWorked(fromShard.adminCommand({configureFailPoint: failpoint, mode: 'off'})); + fp.off(); awaitShell(); } function testMovePrimaryDDL(failpoint, fromShard, toShard, db, shouldFail, sharded) { + jsTest.log("Testing move primary DDL with FP: " + failpoint + " 
shouldFail: " + shouldFail + + " sharded: " + sharded); + let codeToRunInParallelShell = '{ db.getSiblingDB("admin").runCommand({movePrimary: "' + dbName + '", to: "' + toShard.name + '"}); }'; - assert.commandWorked(fromShard.adminCommand({configureFailPoint: failpoint, mode: 'alwaysOn'})); + let fp = configureFailPoint(fromShard, failpoint); let awaitShell = startParallelShell(codeToRunInParallelShell, st.s.port); jsTestLog("Waiting for failpoint " + failpoint); - waitForFailpoint("Hit " + failpoint, 1); + fp.wait(); clearRawMongoProgramOutput(); let collName; @@ -253,11 +259,24 @@ function testMovePrimaryDDL(failpoint, fromShard, toShard, db, shouldFail, shard } }); - assert.commandWorked(fromShard.adminCommand({configureFailPoint: failpoint, mode: 'off'})); + fp.off(); awaitShell(); } +// Reduce DDL lock timeout to half a second to speed up testing commands that are expected to fail +// with a LockBusy error +let overrideDDLLockTimeoutFPs = []; +st.forEachConnection(shard => { + try { + overrideDDLLockTimeoutFPs.push( + configureFailPoint(shard, "overrideDDLLockTimeout", {'timeoutMillisecs': 500})); + } catch (e) { + // The failpoint has been added in 5.3 so multiversion suite will fail to set this failpoint + jsTestLog("Failed to override DDL lock timeout: " + e); + } +}); + createCollections(); let fromShard = st.getPrimaryShard(dbName); let toShard = st.getOther(fromShard); @@ -284,5 +303,7 @@ fromShard = st.getPrimaryShard(dbName); toShard = st.getOther(fromShard); testMovePrimary('hangInCleanStaleDataStage', fromShard, toShard, st.s.getDB(dbName), false, false); +overrideDDLLockTimeoutFPs.forEach(fp => fp.off()); + st.stop(); })(); diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp index 15e875918ac..e6da6be570a 100644 --- a/src/mongo/db/s/sharding_ddl_coordinator.cpp +++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp @@ -50,6 +50,7 @@ namespace mongo { 
MONGO_FAIL_POINT_DEFINE(hangBeforeRunningCoordinatorInstance); +MONGO_FAIL_POINT_DEFINE(overrideDDLLockTimeout); namespace { @@ -143,12 +144,23 @@ ExecutorFuture ShardingDDLCoordinator::_acquireLockAsync( const auto coorName = DDLCoordinatorType_serializer(_coordId.getOperationType()); - auto distLock = distLockManager->lockDirectLocally( - opCtx, resource, DistLockManager::kDefaultLockTimeout); + const auto lockTimeOut = [&]() -> Milliseconds { + if (auto sfp = overrideDDLLockTimeout.scoped(); MONGO_unlikely(sfp.isActive())) { + if (auto timeoutElem = sfp.getData()["timeoutMillisecs"]; timeoutElem.ok()) { + const auto timeoutMillisecs = Milliseconds(timeoutElem.safeNumberLong()); + LOGV2(6320700, + "Overriding DDL lock timeout", + "timeout"_attr = timeoutMillisecs); + return timeoutMillisecs; + } + } + return DistLockManager::kDefaultLockTimeout; + }(); + + auto distLock = distLockManager->lockDirectLocally(opCtx, resource, lockTimeOut); _scopedLocks.emplace(std::move(distLock)); - uassertStatusOK(distLockManager->lockDirect( - opCtx, resource, coorName, DistLockManager::kDefaultLockTimeout)); + uassertStatusOK(distLockManager->lockDirect(opCtx, resource, coorName, lockTimeOut)); }) .until([this](Status status) { return (!_recoveredFromDisk) || status.isOK(); }) .withBackoffBetweenIterations(kExponentialBackoff) -- cgit v1.2.1