summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Russotto <matthew.russotto@10gen.com>2019-01-15 13:44:53 -0500
committerMatthew Russotto <matthew.russotto@10gen.com>2019-01-15 13:50:47 -0500
commitd03e38b3766207db9dd8e1fb1c4bc40c9446df31 (patch)
tree7dcc62c5db3487ccea5c1a7234f990cd383470dd
parent764396a48f5a31b548eab9092967cac610c24b73 (diff)
downloadmongo-d03e38b3766207db9dd8e1fb1c4bc40c9446df31.tar.gz
SERVER-38755 Stop closing connections on stepdown, gated by the "closeConnectionsOnStepdown" server parameter.
-rw-r--r--jstests/replsets/disconnect_on_legacy_write_to_secondary.js13
-rw-r--r--jstests/replsets/no_disconnect_on_stepdown.js90
-rw-r--r--src/mongo/db/read_concern_mongod.cpp10
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp10
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.h2
5 files changed, 112 insertions, 13 deletions
diff --git a/jstests/replsets/disconnect_on_legacy_write_to_secondary.js b/jstests/replsets/disconnect_on_legacy_write_to_secondary.js
index 15ae6d798c0..d3456898945 100644
--- a/jstests/replsets/disconnect_on_legacy_write_to_secondary.js
+++ b/jstests/replsets/disconnect_on_legacy_write_to_secondary.js
@@ -6,7 +6,8 @@
load("jstests/libs/check_log.js");
- const rst = new ReplSetTest({nodes: [{}, {rsConfig: {priority: 0}}]});
+ const rst = new ReplSetTest(
+ {nodes: [{setParameter: {closeConnectionsOnStepdown: false}}, {rsConfig: {priority: 0}}]});
rst.startSet();
rst.initiate();
@@ -60,13 +61,7 @@
operation();
checkLog.contains(primary, failpoint + " fail point enabled");
jsTestLog("Within " + description + ": stepping down and disabling failpoint");
- try {
- assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true}));
- } catch (ex) {
- // TODO(SERVER-38755): Remove this as stepdown should not hang up the command
- // connection.
- assert(isNetworkError(ex));
- }
+ assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true}));
rst.waitForState(primary, ReplSetTest.State.SECONDARY);
assert.commandWorked(
primaryAdmin.adminCommand({configureFailPoint: failpoint, mode: "off"}));
@@ -75,7 +70,7 @@
// We should automatically reconnect after the failed command.
assert.commandWorked(primaryDb.adminCommand({ping: 1}));
// Allow the primary to be re-elected, and wait for it.
- primaryAdmin.adminCommand({replSetFreeze: 0});
+ assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0}));
rst.getPrimary();
}
runStepDownTest({
diff --git a/jstests/replsets/no_disconnect_on_stepdown.js b/jstests/replsets/no_disconnect_on_stepdown.js
new file mode 100644
index 00000000000..d21f9785d6a
--- /dev/null
+++ b/jstests/replsets/no_disconnect_on_stepdown.js
@@ -0,0 +1,90 @@
+/**
+ * Tests that stepdown terminates writes, but does not disconnect connections.
+ */
+(function() {
+ "use strict";
+
+ load("jstests/libs/check_log.js");
+
+ const rst = new ReplSetTest(
+ {nodes: [{setParameter: {closeConnectionsOnStepdown: false}}, {rsConfig: {priority: 0}}]});
+ rst.startSet();
+ rst.initiate();
+
+ const primary = rst.getPrimary();
+ const primaryAdmin = primary.getDB("admin");
+ // We need a separate connection to avoid interference with the ReplSetTestMechanism.
+ const primaryDataConn = new Mongo(primary.host);
+ const primaryDb = primaryDataConn.getDB("test");
+ const collname = "no_disconnect_on_stepdown";
+ const coll = primaryDb[collname];
+
+ // Never retry on network error, because this test needs to detect the network error.
+ TestData.skipRetryOnNetworkError = true;
+
+ // Legacy writes will still disconnect, so don't use them.
+ primaryDataConn.forceWriteMode('commands');
+
+ assert.commandWorked(coll.insert([{_id: 'deleteme'}, {_id: 'updateme'}, {_id: 'findme'}]));
+ rst.awaitReplication();
+
+ jsTestLog("Stepping down with no command in progress. Should not disconnect.");
+ // If the 'primary' connection is broken on stepdown, this command will fail.
+ assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true}));
+ rst.waitForState(primary, ReplSetTest.State.SECONDARY);
+ // If the 'primaryDataConn' connection was broken during stepdown, this command will fail.
+ assert.commandWorked(primaryDb.adminCommand({ping: 1}));
+ // Allow the primary to be re-elected, and wait for it.
+ assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0}));
+ rst.getPrimary();
+
+ function runStepDownTest({description, failpoint, operation, errorCode}) {
+ jsTestLog(`Trying ${description} on a stepping-down primary`);
+ assert.commandWorked(
+ primaryAdmin.adminCommand({configureFailPoint: failpoint, mode: "alwaysOn"}));
+
+ errorCode = errorCode || ErrorCodes.InterruptedDueToStepDown;
+ const writeCommand = `db.getMongo().forceWriteMode("commands");
+ assert.commandFailedWithCode(${operation}, ${errorCode});
+ assert.commandWorked(db.adminCommand({ping:1}));`;
+
+ const waitForShell = startParallelShell(writeCommand, primary.port);
+ checkLog.contains(primary, failpoint + " fail point enabled");
+ assert.commandWorked(primaryAdmin.adminCommand({replSetStepDown: 60, force: true}));
+ rst.waitForState(primary, ReplSetTest.State.SECONDARY);
+ assert.commandWorked(
+ primaryAdmin.adminCommand({configureFailPoint: failpoint, mode: "off"}));
+ try {
+ waitForShell();
+ } catch (ex) {
+ print("Failed trying to write or ping in " + description + ", possibly disconnected.");
+ throw ex;
+ }
+
+ // Allow the primary to be re-elected, and wait for it.
+ assert.commandWorked(primaryAdmin.adminCommand({replSetFreeze: 0}));
+ rst.getPrimary();
+ }
+ runStepDownTest({
+ description: "insert",
+ failpoint: "hangDuringBatchInsert",
+ operation: "db['" + collname + "'].insert({id:0})"
+ });
+ runStepDownTest({
+ description: "update",
+ failpoint: "hangDuringBatchUpdate",
+ operation: "db['" + collname + "'].update({_id: 'updateme'}, {'$set': {x: 1}})"
+ });
+ runStepDownTest({
+ description: "remove",
+ failpoint: "hangDuringBatchRemove",
+ operation: "db['" + collname + "'].remove({_id: 'deleteme'}, {'$set': {x: 1}})"
+ });
+ runStepDownTest({
+ description: "linearizable read",
+ failpoint: "hangBeforeLinearizableReadConcern",
+ operation: "db.runCommand({find: '" + collname +
+ "', filter: {'_id': 'findme'}, readConcern: {level: 'linearizable'}})",
+ });
+ rst.stopSet();
+})();
diff --git a/src/mongo/db/read_concern_mongod.cpp b/src/mongo/db/read_concern_mongod.cpp
index 0967968159b..fb41d7122c6 100644
--- a/src/mongo/db/read_concern_mongod.cpp
+++ b/src/mongo/db/read_concern_mongod.cpp
@@ -34,6 +34,7 @@
#include "mongo/base/status.h"
#include "mongo/db/concurrency/d_concurrency.h"
#include "mongo/db/concurrency/write_conflict_exception.h"
+#include "mongo/db/curop_failpoint_helpers.h"
#include "mongo/db/logical_clock.h"
#include "mongo/db/op_observer.h"
#include "mongo/db/operation_context.h"
@@ -51,6 +52,8 @@ namespace mongo {
namespace {
+MONGO_FAIL_POINT_DEFINE(hangBeforeLinearizableReadConcern);
+
/**
* Synchronize writeRequests
*/
@@ -344,6 +347,12 @@ MONGO_REGISTER_SHIM(waitForReadConcern)
MONGO_REGISTER_SHIM(waitForLinearizableReadConcern)(OperationContext* opCtx)->Status {
+ CurOpFailpointHelpers::waitWhileFailPointEnabled(
+ &hangBeforeLinearizableReadConcern, opCtx, "hangBeforeLinearizableReadConcern", [opCtx]() {
+            log() << "batch update - hangBeforeLinearizableReadConcern fail point enabled. "
+                     "Blocking until fail point is disabled.";
+            // NOTE(review): the "batch update - " prefix looks like a copy-paste from the
+            // hangDuringBatchUpdate failpoint; message should likely name only
+            // hangBeforeLinearizableReadConcern — confirm against the upstream commit.
+ });
+
repl::ReplicationCoordinator* replCoord =
repl::ReplicationCoordinator::get(opCtx->getClient()->getServiceContext());
@@ -370,6 +379,7 @@ MONGO_REGISTER_SHIM(waitForLinearizableReadConcern)(OperationContext* opCtx)->St
repl::OpTime lastOpApplied = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
auto awaitReplResult = replCoord->awaitReplication(opCtx, lastOpApplied, wc);
+
if (awaitReplResult.status == ErrorCodes::WriteConcernFailed) {
return Status(ErrorCodes::LinearizableReadConcernError,
"Failed to confirm that read was linearizable.");
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 65ecfe92aa5..287cf9bae41 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -97,6 +97,8 @@ MONGO_FAIL_POINT_DEFINE(stepdownHangBeforePerformingPostMemberStateUpdateActions
MONGO_FAIL_POINT_DEFINE(transitionToPrimaryHangBeforeTakingGlobalExclusiveLock);
MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp);
+MONGO_EXPORT_SERVER_PARAMETER(closeConnectionsOnStepdown, bool, true);
+
using CallbackArgs = executor::TaskExecutor::CallbackArgs;
using CallbackFn = executor::TaskExecutor::CallbackFn;
using CallbackHandle = executor::TaskExecutor::CallbackHandle;
@@ -2591,7 +2593,7 @@ ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator(WithLock l
invariant(!_readWriteAbility->canAcceptNonLocalWrites(lk));
serverGlobalParams.validateFeaturesAsMaster.store(false);
- result = kActionCloseAllConnections;
+ result = kActionSteppedDownOrRemoved;
} else {
result = kActionFollowerModeStateChange;
}
@@ -2696,8 +2698,10 @@ void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction(
case kActionFollowerModeStateChange:
_onFollowerModeStateChange();
break;
- case kActionCloseAllConnections:
- _externalState->closeConnections();
+ case kActionSteppedDownOrRemoved:
+ if (closeConnectionsOnStepdown.load()) {
+ _externalState->closeConnections();
+ }
_externalState->shardingOnStepDownHook();
_externalState->stopNoopWriter();
break;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index ab6e2c4dd62..4e81b40b93a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -439,7 +439,7 @@ private:
*/
enum PostMemberStateUpdateAction {
kActionNone,
- kActionCloseAllConnections, // Also indicates that we should clear sharding state.
+ kActionSteppedDownOrRemoved,
kActionFollowerModeStateChange,
kActionStartSingleNodeElection
};