From d7d6a14d30d8cd5b9ecc9d0bb74a959ab9349b61 Mon Sep 17 00:00:00 2001
From: Vesselina Ratcheva
Date: Mon, 11 May 2020 19:58:09 -0400
Subject: SERVER-46357 Make it clear in currentOp when an automatic reconfig is happening

---
 .../currentOp_during_automatic_reconfig.js         | 96 ++++++++++++++++++++++
 src/mongo/db/repl/replication_coordinator_impl.cpp | 27 ++++++
 2 files changed, 123 insertions(+)
 create mode 100644 jstests/replsets/currentOp_during_automatic_reconfig.js

diff --git a/jstests/replsets/currentOp_during_automatic_reconfig.js b/jstests/replsets/currentOp_during_automatic_reconfig.js
new file mode 100644
index 00000000000..846ab3d0f50
--- /dev/null
+++ b/jstests/replsets/currentOp_during_automatic_reconfig.js
@@ -0,0 +1,96 @@
+/**
+ * Tests that currentOp displays information about in-progress automatic reconfigs.
+ *
+ * @tags: [
+ *   requires_fcv_46,
+ * ]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load('jstests/replsets/rslib.js');
+
+const testName = jsTestName();
+const dbName = "testdb";
+const collName = "testcoll";
+
+const rst = new ReplSetTest({
+    name: testName,
+    nodes: [{}],
+    nodeOptions: {setParameter: {enableAutomaticReconfig: true}},
+    settings: {chainingAllowed: false},
+    useBridge: true
+});
+rst.startSet();
+rst.initiateWithHighElectionTimeout();
+
+const primary = rst.getPrimary();
+const primaryDb = primary.getDB(dbName);
+const primaryColl = primaryDb.getCollection(collName);
+
+// TODO (SERVER-46808): Move this into ReplSetTest.initiate
+waitForNewlyAddedRemovalForNodeToBeCommitted(primary, 0);
+waitForConfigReplication(primary, rst.nodes);
+
+assert.commandWorked(primaryColl.insert({"starting": "doc"}));
+
+jsTestLog("Adding a new node to the replica set");
+const secondary = rst.add({
+    rsConfig: {priority: 0},
+    setParameter: {
+        'failpoint.initialSyncHangBeforeFinish': tojson({mode: 'alwaysOn'}),
+        'numInitialSyncAttempts': 1,
+        'enableAutomaticReconfig': true,
+    }
+});
+rst.reInitiate();
+assert.commandWorked(secondary.adminCommand({
+    waitForFailPoint: "initialSyncHangBeforeFinish",
+    timesEntered: 1,
+    maxTimeMS: kDefaultWaitForFailPointTimeout
+}));
+
+jsTestLog("Checking that the 'newlyAdded' field is set on the new node");
+assert(isMemberNewlyAdded(primary, 1));
+
+jsTestLog("Allowing primary to initiate the 'newlyAdded' field removal");
+let hangDuringAutomaticReconfigFP = configureFailPoint(primaryDb, "hangDuringAutomaticReconfig");
+assert.commandWorked(
+    secondary.adminCommand({configureFailPoint: "initialSyncHangBeforeFinish", mode: "off"}));
+rst.waitForState(secondary, ReplSetTest.State.SECONDARY);
+
+hangDuringAutomaticReconfigFP.wait();
+
+jsTestLog("Looking for the automatic reconfig in the currentOp output");
+const curOpRes = assert.commandWorked(primaryDb.adminCommand({currentOp: 1}));
+
+const ops = curOpRes.inprog;
+let found = false;
+for (let i = 0; i < ops.length; i++) {
+    let op = ops[i];
+    assert(op.hasOwnProperty("command"), op);
+    const commandField = op["command"];
+    if (commandField.hasOwnProperty("replSetReconfig")) {
+        if (commandField["replSetReconfig"] === "automatic") {
+            assert(commandField.hasOwnProperty("configVersionAndTerm"));
+            assert(commandField.hasOwnProperty("memberId"), op);
+            assert.eq(1, commandField["memberId"], op);
+
+            assert(op.hasOwnProperty("desc"), op);
+            assert(op["desc"].startsWith("ReplCoord"));  // client name
+
+            jsTestLog("Found automatic reconfig: " + tojson(op));
+            found = true;
+            break;
+        }
+    }
+}
+
+assert(found, ops);
+
+hangDuringAutomaticReconfigFP.off();
+waitForNewlyAddedRemovalForNodeToBeCommitted(primary, 1);
+rst.stopSet();
+})();
\ No newline at end of file
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index bcae984ce91..c887ef0c442 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -55,6 +55,7 @@
 #include "mongo/db/commands/test_commands_enabled.h"
 #include "mongo/db/concurrency/d_concurrency.h"
 #include "mongo/db/concurrency/replication_state_transition_lock_guard.h"
+#include "mongo/db/curop.h"
 #include "mongo/db/curop_failpoint_helpers.h"
 #include "mongo/db/dbdirectclient.h"
 #include "mongo/db/index/index_descriptor.h"
@@ -127,6 +128,8 @@ MONGO_FAIL_POINT_DEFINE(omitConfigQuorumCheck);
 // Will cause signal drain complete to hang after reconfig
 MONGO_FAIL_POINT_DEFINE(hangAfterReconfigOnDrainComplete);
 MONGO_FAIL_POINT_DEFINE(doNotRemoveNewlyAddedOnHeartbeats);
+// Will hang right after setting the currentOp info associated with an automatic reconfig.
+MONGO_FAIL_POINT_DEFINE(hangDuringAutomaticReconfig);
 
 // Number of times we tried to go live as a secondary.
 Counter64 attemptsToBecomeSecondary;
@@ -3716,6 +3719,30 @@ void ReplicationCoordinatorImpl::_reconfigToRemoveNewlyAddedField(
     };
 
     auto opCtx = cc().makeOperationContext();
+
+    // Set info for currentOp to display if called while this is still running.
+    {
+        stdx::unique_lock lk(*opCtx->getClient());
+        auto curOp = CurOp::get(opCtx.get());
+        curOp->setLogicalOp_inlock(LogicalOp::opCommand);
+        BSONObjBuilder bob;
+        bob.append("replSetReconfig", "automatic");
+        bob.append("memberId", memberId.getData());
+        bob.append("configVersionAndTerm", versionAndTerm.toString());
+        bob.append("info",
+                   "An automatic reconfig. Used to remove a 'newlyAdded' config field for a "
+                   "replica set member.");
+        curOp->setOpDescription_inlock(bob.obj());
+        curOp->setNS_inlock("local.system.replset");
+        curOp->ensureStarted();
+    }
+
+    if (MONGO_unlikely(hangDuringAutomaticReconfig.shouldFail())) {
+        LOGV2(4635700,
+              "Failpoint 'hangDuringAutomaticReconfig' enabled. Blocking until it is disabled.");
+        hangDuringAutomaticReconfig.pauseWhileSet();
+    }
+
     auto status = doReplSetReconfig(opCtx.get(), getNewConfig, false /* force */);
 
     if (!status.isOK()) {
-- 
cgit v1.2.1