diff options
author | Matthew Russotto <matthew.russotto@10gen.com> | 2019-01-15 13:44:53 -0500 |
---|---|---|
committer | Matthew Russotto <matthew.russotto@10gen.com> | 2019-01-15 13:50:47 -0500 |
commit | d03e38b3766207db9dd8e1fb1c4bc40c9446df31 (patch) | |
tree | 7dcc62c5db3487ccea5c1a7234f990cd383470dd | |
parent | 764396a48f5a31b548eab9092967cac610c24b73 (diff) | |
download | mongo-d03e38b3766207db9dd8e1fb1c4bc40c9446df31.tar.gz |
SERVER-38755 Stop closing connections on stepdown, gated by the "closeConnectionsOnStepdown" server parameter.
-rw-r--r-- | jstests/replsets/disconnect_on_legacy_write_to_secondary.js | 13 | ||||
-rw-r--r-- | jstests/replsets/no_disconnect_on_stepdown.js | 90 | ||||
-rw-r--r-- | src/mongo/db/read_concern_mongod.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 2 |
5 files changed, 112 insertions, 13 deletions
diff --git a/jstests/replsets/disconnect_on_legacy_write_to_secondary.js b/jstests/replsets/disconnect_on_legacy_write_to_secondary.js index 15ae6d798c0..d3456898945 100644 --- a/jstests/replsets/disconnect_on_legacy_write_to_secondary.js +++ b/jstests/replsets/disconnect_on_legacy_write_to_secondary.js @@ -6,7 +6,8 @@ load("jstests/libs/check_log.js"); - const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0}}]}); + const rst = new ReplSetTest( + {nodes: [{setParameter: {closeConnectionsOnStepdown: false}}, {rsConfig: {priority: 0}}]}); rst.startSet(); rst.initiate(); @@ -60,13 +61,7 @@ operation(); checkLog.contains(primary, failpoint + " fail point enabled"); jsTestLog("Within " + description + ": stepping down and disabling failpoint"); - try { - assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true})); - } catch (ex) { - // TODO(SERVER-38755): Remove this as stepdown should not hang up the command - // connection. - assert(isNetworkError(ex)); - } + assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true})); rst.waitForState(primary, ReplSetTest.State.SECONDARY); assert.commandWorked( primaryAdmin.adminCommand({configureFailPoint: failpoint, mode: "off"})); @@ -75,7 +70,7 @@ // We should automatically reconnect after the failed command. assert.commandWorked(primaryDb.adminCommand({ping: 1})); // Allow the primary to be re-elected, and wait for it. - primaryAdmin.adminCommand({replSetFreeze: 0}); + assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0})); rst.getPrimary(); } runStepDownTest({ diff --git a/jstests/replsets/no_disconnect_on_stepdown.js b/jstests/replsets/no_disconnect_on_stepdown.js new file mode 100644 index 00000000000..d21f9785d6a --- /dev/null +++ b/jstests/replsets/no_disconnect_on_stepdown.js @@ -0,0 +1,90 @@ +/** + * Tests that stepdown terminates writes, but does not disconnect connections. 
+ */ +(function() { + "use strict"; + + load("jstests/libs/check_log.js"); + + const rst = new ReplSetTest( + {nodes: [{setParameter: {closeConnectionsOnStepdown: false}}, {rsConfig: {priority: 0}}]}); + rst.startSet(); + rst.initiate(); + + const primary = rst.getPrimary(); + const primaryAdmin = primary.getDB("admin"); + // We need a separate connection to avoid interference with the ReplSetTestMechanism. + const primaryDataConn = new Mongo(primary.host); + const primaryDb = primaryDataConn.getDB("test"); + const collname = "no_disconnect_on_stepdown"; + const coll = primaryDb[collname]; + + // Never retry on network error, because this test needs to detect the network error. + TestData.skipRetryOnNetworkError = true; + + // Legacy writes will still disconnect, so don't use them. + primaryDataConn.forceWriteMode('commands'); + + assert.commandWorked(coll.insert([{_id: 'deleteme'}, {_id: 'updateme'}, {_id: 'findme'}])); + rst.awaitReplication(); + + jsTestLog("Stepping down with no command in progress. Should not disconnect."); + // If the 'primary' connection is broken on stepdown, this command will fail. + assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true})); + rst.waitForState(primary, ReplSetTest.State.SECONDARY); + // If the 'primaryDataConn' connection was broken during stepdown, this command will fail. + assert.commandWorked(primaryDb.adminCommand({ping: 1})); + // Allow the primary to be re-elected, and wait for it. 
+ assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0})); + rst.getPrimary(); + + function runStepDownTest({description, failpoint, operation, errorCode}) { + jsTestLog(`Trying ${description} on a stepping-down primary`); + assert.commandWorked( + primaryAdmin.adminCommand({configureFailPoint: failpoint, mode: "alwaysOn"})); + + errorCode = errorCode || ErrorCodes.InterruptedDueToStepDown; + const writeCommand = `db.getMongo().forceWriteMode("commands"); + assert.commandFailedWithCode(${operation}, ${errorCode}); + assert.commandWorked(db.adminCommand({ping:1}));`; + + const waitForShell = startParallelShell(writeCommand, primary.port); + checkLog.contains(primary, failpoint + " fail point enabled"); + assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true})); + rst.waitForState(primary, ReplSetTest.State.SECONDARY); + assert.commandWorked( + primaryAdmin.adminCommand({configureFailPoint: failpoint, mode: "off"})); + try { + waitForShell(); + } catch (ex) { + print("Failed trying to write or ping in " + description + ", possibly disconnected."); + throw ex; + } + + // Allow the primary to be re-elected, and wait for it. 
+ assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0})); + rst.getPrimary(); + } + runStepDownTest({ + description: "insert", + failpoint: "hangDuringBatchInsert", + operation: "db['" + collname + "'].insert({id:0})" + }); + runStepDownTest({ + description: "update", + failpoint: "hangDuringBatchUpdate", + operation: "db['" + collname + "'].update({_id: 'updateme'}, {'$set': {x: 1}})" + }); + runStepDownTest({ + description: "remove", + failpoint: "hangDuringBatchRemove", + operation: "db['" + collname + "'].remove({_id: 'deleteme'}, {'$set': {x: 1}})" + }); + runStepDownTest({ + description: "linearizable read", + failpoint: "hangBeforeLinearizableReadConcern", + operation: "db.runCommand({find: '" + collname + + "', filter: {'_id': 'findme'}, readConcern: {level: 'linearizable'}})", + }); + rst.stopSet(); +})(); diff --git a/src/mongo/db/read_concern_mongod.cpp b/src/mongo/db/read_concern_mongod.cpp index 0967968159b..fb41d7122c6 100644 --- a/src/mongo/db/read_concern_mongod.cpp +++ b/src/mongo/db/read_concern_mongod.cpp @@ -34,6 +34,7 @@ #include "mongo/base/status.h" #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/concurrency/write_conflict_exception.h" +#include "mongo/db/curop_failpoint_helpers.h" #include "mongo/db/logical_clock.h" #include "mongo/db/op_observer.h" #include "mongo/db/operation_context.h" @@ -51,6 +52,8 @@ namespace mongo { namespace { +MONGO_FAIL_POINT_DEFINE(hangBeforeLinearizableReadConcern); + /** * Synchronize writeRequests */ @@ -344,6 +347,12 @@ MONGO_REGISTER_SHIM(waitForReadConcern) MONGO_REGISTER_SHIM(waitForLinearizableReadConcern)(OperationContext* opCtx)->Status { + CurOpFailpointHelpers::waitWhileFailPointEnabled( + &hangBeforeLinearizableReadConcern, opCtx, "hangBeforeLinearizableReadConcern", [opCtx]() { + log() << "batch update - hangBeforeLinearizableReadConcern fail point enabled. 
" + "Blocking until fail point is disabled."; + }); + repl::ReplicationCoordinator* replCoord = repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext()); @@ -370,6 +379,7 @@ MONGO_REGISTER_SHIM(waitForLinearizableReadConcern)(OperationContext* opCtx)->St repl::OpTime lastOpApplied = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp(); auto awaitReplResult = replCoord->awaitReplication(opCtx, lastOpApplied, wc); + if (awaitReplResult.status == ErrorCodes::WriteConcernFailed) { return Status(ErrorCodes::LinearizableReadConcernError, "Failed to confirm that read was linearizable."); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 65ecfe92aa5..287cf9bae41 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -97,6 +97,8 @@ MONGO_FAIL_POINT_DEFINE(stepdownHangBeforePerformingPostMemberStateUpdateActions MONGO_FAIL_POINT_DEFINE(transitionToPrimaryHangBeforeTakingGlobalExclusiveLock); MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp); +MONGO_EXPORT_SERVER_PARAMETER(closeConnectionsOnStepdown, bool, true); + using CallbackArgs = executor::TaskExecutor::CallbackArgs; using CallbackFn = executor::TaskExecutor::CallbackFn; using CallbackHandle = executor::TaskExecutor::CallbackHandle; @@ -2591,7 +2593,7 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk)); serverGlobalParams.validateFeaturesAsMaster.store(false); - result = kActionCloseAllConnections; + result = kActionSteppedDownOrRemoved; } else { result = kActionFollowerModeStateChange; } @@ -2696,8 +2698,10 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction( case kActionFollowerModeStateChange: _onFollowerModeStateChange(); break; - case kActionCloseAllConnections: - _externalState->closeConnections(); + case 
kActionSteppedDownOrRemoved: + if (closeConnectionsOnStepdown.load()) { + _externalState->closeConnections(); + } _externalState->shardingOnStepDownHook(); _externalState->stopNoopWriter(); break; diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index ab6e2c4dd62..4e81b40b93a 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -439,7 +439,7 @@ private: */ enum PostMemberStateUpdateAction { kActionNone, - kActionCloseAllConnections, // Also indicates that we should clear sharding state. + kActionSteppedDownOrRemoved, kActionFollowerModeStateChange, kActionStartSingleNodeElection }; |