summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Russotto <matthew.russotto@mongodb.com>2021-11-18 21:57:48 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-11-19 04:09:19 +0000
commit63b46f14867df2de8f823b9eb1b65bda3ac4b1c2 (patch)
tree0a7e1a2b82a3a1e64387a90f69e0ce88c4250937
parent86b8db8755d10f86a28a843f501007fe05ff0324 (diff)
downloadmongo-63b46f14867df2de8f823b9eb1b65bda3ac4b1c2.tar.gz
SERVER-61593 Test command waitForMemberState should be interruptible
-rw-r--r--buildscripts/resmokelib/testing/hooks/initialsync.py14
-rw-r--r--buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py44
-rw-r--r--src/mongo/db/repl/repl_set_commands.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator.h5
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.cpp5
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl.h4
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp3
-rw-r--r--src/mongo/db/repl/replication_coordinator_impl_test.cpp45
-rw-r--r--src/mongo/db/repl/replication_coordinator_mock.cpp3
-rw-r--r--src/mongo/db/repl/replication_coordinator_mock.h4
-rw-r--r--src/mongo/db/repl/replication_coordinator_noop.cpp2
-rw-r--r--src/mongo/db/repl/replication_coordinator_noop.h2
-rw-r--r--src/mongo/embedded/replication_coordinator_embedded.cpp4
-rw-r--r--src/mongo/embedded/replication_coordinator_embedded.h2
14 files changed, 97 insertions, 42 deletions
diff --git a/buildscripts/resmokelib/testing/hooks/initialsync.py b/buildscripts/resmokelib/testing/hooks/initialsync.py
index 171787fe900..2da12b17bfe 100644
--- a/buildscripts/resmokelib/testing/hooks/initialsync.py
+++ b/buildscripts/resmokelib/testing/hooks/initialsync.py
@@ -60,6 +60,8 @@ class BackgroundInitialSyncTestCase(jsfile.DynamicJSTestCase):
"""BackgroundInitialSyncTestCase class."""
JS_FILENAME = os.path.join("jstests", "hooks", "run_initial_sync_node_validation.js")
+ INTERRUPTED_DUE_TO_REPL_STATE_CHANGE = 11602
+ INTERRUPTED_DUE_TO_STORAGE_CHANGE = 355
def __init__( # pylint: disable=too-many-arguments
self, logger, test_name, description, base_test_name, hook, shell_options=None):
@@ -83,7 +85,17 @@ class BackgroundInitialSyncTestCase(jsfile.DynamicJSTestCase):
[("replSetTest", 1), ("waitForMemberState", 2),
("timeoutMillis",
fixture_interface.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60 * 1000)])
- sync_node_conn.admin.command(cmd)
+ while True:
+ try:
+ sync_node_conn.admin.command(cmd)
+ break
+ except pymongo.errors.OperationFailure as err:
+ if (err.code != self.INTERRUPTED_DUE_TO_REPL_STATE_CHANGE
+ and err.code != self.INTERRUPTED_DUE_TO_STORAGE_CHANGE):
+ raise
+ msg = ("Interrupted while waiting for node to reach secondary state, retrying: {}"
+ ).format(err)
+ self.logger.error(msg)
# Check if the initial sync node is in SECONDARY state. If it's been 'n' tests, then it
# should have waited to be in SECONDARY state and the test should be marked as a failure.
diff --git a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py
index 8e70da8a09f..b0ff858c5dc 100644
--- a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py
+++ b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py
@@ -123,6 +123,9 @@ class PeriodicKillSecondaries(interface.Hook):
class PeriodicKillSecondariesTestCase(interface.DynamicTestCase):
"""PeriodicKillSecondariesTestCase class."""
+ INTERRUPTED_DUE_TO_REPL_STATE_CHANGE = 11602
+ INTERRUPTED_DUE_TO_STORAGE_CHANGE = 355
+
def __init__( # pylint: disable=too-many-arguments
self, logger, test_name, description, base_test_name, hook, test_report):
"""Initialize PeriodicKillSecondariesTestCase."""
@@ -434,19 +437,28 @@ class PeriodicKillSecondariesTestCase(interface.DynamicTestCase):
def _await_secondary_state(self, secondary):
client = secondary.mongo_client()
- try:
- client.admin.command(
- bson.SON([
- ("replSetTest", 1),
- ("waitForMemberState", 2), # 2 = SECONDARY
- ("timeoutMillis",
- fixture.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60 * 1000)
- ]))
- except pymongo.errors.OperationFailure as err:
- self.logger.exception(
- "mongod on port %d failed to reach state SECONDARY after %d seconds",
- secondary.port, fixture.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60)
- raise errors.ServerFailure(
- "mongod on port {} failed to reach state SECONDARY after {} seconds: {}".format(
- secondary.port, fixture.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60,
- err.args[0]))
+ while True:
+ try:
+ client.admin.command(
+ bson.SON([
+ ("replSetTest", 1),
+ ("waitForMemberState", 2), # 2 = SECONDARY
+ ("timeoutMillis",
+ fixture.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60 * 1000)
+ ]))
+ break
+ except pymongo.errors.OperationFailure as err:
+ if (err.code != self.INTERRUPTED_DUE_TO_REPL_STATE_CHANGE
+ and err.code != self.INTERRUPTED_DUE_TO_STORAGE_CHANGE):
+ self.logger.exception(
+ "mongod on port %d failed to reach state SECONDARY after %d seconds",
+ secondary.port, fixture.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60)
+ raise errors.ServerFailure(
+ "mongod on port {} failed to reach state SECONDARY after {} seconds: {}".
+ format(secondary.port,
+ fixture.ReplFixture.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60,
+ err.args[0]))
+
+ msg = ("Interrupted while waiting for node to reach secondary state, retrying: {}"
+ ).format(err)
+ self.logger.error(msg)
diff --git a/src/mongo/db/repl/repl_set_commands.cpp b/src/mongo/db/repl/repl_set_commands.cpp
index 41ae248bf07..7299875254e 100644
--- a/src/mongo/db/repl/repl_set_commands.cpp
+++ b/src/mongo/db/repl/repl_set_commands.cpp
@@ -129,7 +129,7 @@ public:
"timeout"_attr = timeout,
"expectedState"_attr = expectedState);
- status = replCoord->waitForMemberState(expectedState, timeout);
+ status = replCoord->waitForMemberState(opCtx, expectedState, timeout);
uassertStatusOK(status);
return true;
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 06810324584..e1a49788d4f 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -196,8 +196,11 @@ public:
* Returns OK if member state is 'state'.
* Returns ErrorCodes::ExceededTimeLimit if we timed out waiting for the state change.
* Returns ErrorCodes::BadValue if timeout is negative.
+ * Throws if interrupted (pass Interruptible::notInterruptible() if interruption is not desired.
*/
- virtual Status waitForMemberState(MemberState expectedState, Milliseconds timeout) = 0;
+ virtual Status waitForMemberState(Interruptible* interruptible,
+ MemberState expectedState,
+ Milliseconds timeout) = 0;
/**
* Returns true if this node is in state PRIMARY or SECONDARY.
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index ded42849496..4a7c66832d1 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1074,7 +1074,8 @@ MemberState ReplicationCoordinatorImpl::_getMemberState_inlock() const {
return _memberState;
}
-Status ReplicationCoordinatorImpl::waitForMemberState(MemberState expectedState,
+Status ReplicationCoordinatorImpl::waitForMemberState(Interruptible* interruptible,
+ MemberState expectedState,
Milliseconds timeout) {
if (timeout < Milliseconds(0)) {
return Status(ErrorCodes::BadValue, "Timeout duration cannot be negative");
@@ -1082,7 +1083,7 @@ Status ReplicationCoordinatorImpl::waitForMemberState(MemberState expectedState,
stdx::unique_lock<Latch> lk(_mutex);
auto pred = [this, expectedState]() { return _memberState == expectedState; };
- if (!_memberStateChange.wait_for(lk, timeout.toSystemDuration(), pred)) {
+ if (!interruptible->waitForConditionOrInterruptFor(_memberStateChange, lk, timeout, pred)) {
return Status(ErrorCodes::ExceededTimeLimit,
str::stream()
<< "Timed out waiting for state to become " << expectedState.toString()
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index dae323acb51..0119194b0dc 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -126,7 +126,9 @@ public:
virtual bool canAcceptNonLocalWrites() const override;
- virtual Status waitForMemberState(MemberState expectedState, Milliseconds timeout) override;
+ virtual Status waitForMemberState(Interruptible* interruptible,
+ MemberState expectedState,
+ Milliseconds timeout) override;
virtual bool isInPrimaryOrSecondaryState(OperationContext* opCtx) const override;
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
index 2e08f3890d0..d6b75739267 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp
@@ -1520,7 +1520,8 @@ void HBStepdownAndReconfigTest::assertSteppedDown() {
LOGV2(463811, "Waiting for step down to complete");
// Wait for step down to finish since it may be asynchronous.
auto timeout = Milliseconds(5 * 60 * 1000);
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_SECONDARY, timeout));
+ ASSERT_OK(getReplCoord()->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_SECONDARY, timeout));
// Primary stepped down.
ASSERT_EQUALS(2, getReplCoord()->getTerm());
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index ffbd17f7c82..9a2e2b84c5c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -3092,7 +3092,8 @@ TEST_F(ReplCoordTest, DoNotAllowMaintenanceModeWhilePrimary) {
// Step down from primary.
getReplCoord()->updateTerm(opCtx.get(), getReplCoord()->getTerm() + 1).transitional_ignore();
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_SECONDARY, Seconds(1)));
+ ASSERT_OK(
+ getReplCoord()->waitForMemberState(opCtx.get(), MemberState::RS_SECONDARY, Seconds(1)));
status = getReplCoord()->setMaintenanceMode(opCtx.get(), false);
ASSERT_EQUALS(ErrorCodes::OperationFailed, status);
@@ -3125,7 +3126,7 @@ TEST_F(ReplCoordTest, DoNotAllowSettingMaintenanceModeWhileConductingAnElection)
// Step down from primary.
getReplCoord()->updateTerm(opCtx.get(), getReplCoord()->getTerm() + 1).transitional_ignore();
getReplCoord()
- ->waitForMemberState(MemberState::RS_SECONDARY, Milliseconds(10 * 1000))
+ ->waitForMemberState(opCtx.get(), MemberState::RS_SECONDARY, Milliseconds(10 * 1000))
.transitional_ignore();
// Can't modify maintenance mode when running for election (before and after dry run).
@@ -3739,7 +3740,8 @@ TEST_F(ReplCoordTest, HelloReturnsErrorInQuiesceModeWhenNodeIsRemoved) {
exitNetwork();
// Wait for the node to be removed. Test that we increment the topology version.
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_REMOVED, Seconds(1)));
+ ASSERT_OK(getReplCoord()->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_REMOVED, Seconds(1)));
ASSERT_EQUALS(removedFromConfig.getConfigVersion(), getReplCoord()->getConfigVersion());
auto topologyVersionAfterRemoved = getTopoCoord().getTopologyVersion();
ASSERT_EQUALS(topologyVersionAfterQuiesceMode.getCounter() + 1,
@@ -4392,7 +4394,8 @@ TEST_F(ReplCoordTest, HelloOnRemovedNode) {
exitNetwork();
// node1 no longer exists in the replica set config.
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_REMOVED, Seconds(1)));
+ ASSERT_OK(getReplCoord()->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_REMOVED, Seconds(1)));
ASSERT_EQUALS(removedFromConfig.getConfigVersion(), getReplCoord()->getConfigVersion());
const auto maxAwaitTime = Milliseconds(5000);
@@ -4553,7 +4556,7 @@ TEST_F(ReplCoordTest, AwaitHelloRespondsCorrectlyWhenNodeRemovedAndReadded) {
exitNetwork();
// node1 no longer exists in the replica set config.
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_REMOVED, Seconds(1)));
+ ASSERT_OK(getReplCoord()->waitForMemberState(opCtx.get(), MemberState::RS_REMOVED, Seconds(1)));
ASSERT_EQUALS(removedFromConfig.getConfigVersion(), getReplCoord()->getConfigVersion());
getHelloWaitingForRemovedNodeThread.join();
const std::string newHorizonSniName = "newhorizon.com";
@@ -4593,7 +4596,8 @@ TEST_F(ReplCoordTest, AwaitHelloRespondsCorrectlyWhenNodeRemovedAndReadded) {
});
replyToReceivedHeartbeatV1();
reconfigThread.join();
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_SECONDARY, Seconds(1)));
+ ASSERT_OK(
+ getReplCoord()->waitForMemberState(opCtx.get(), MemberState::RS_SECONDARY, Seconds(1)));
getHelloThread.join();
stdx::thread getHelloThreadNewHorizon([&] {
@@ -6605,7 +6609,8 @@ TEST_F(ReplCoordTest,
net->runReadyNetworkOperations();
net->exitNetwork();
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_REMOVED, Seconds(1)));
+ ASSERT_OK(getReplCoord()->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_REMOVED, Seconds(1)));
ASSERT_EQUALS(config.getConfigVersion(), getReplCoord()->getConfigVersion());
getReplCoord()->cancelAndRescheduleElectionTimeout();
@@ -7293,7 +7298,8 @@ TEST_F(ReplCoordTest, StepDownWhenHandleLivenessTimeoutMarksAMajorityOfVotingNod
}
getNet()->exitNetwork();
- ASSERT_OK(getReplCoord()->waitForMemberState(MemberState::RS_SECONDARY, Hours{1}));
+ ASSERT_OK(getReplCoord()->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_SECONDARY, Hours{1}));
}
TEST_F(ReplCoordTest, WaitForMemberState) {
@@ -7318,17 +7324,30 @@ TEST_F(ReplCoordTest, WaitForMemberState) {
ASSERT_EQUALS(initialTerm + 1, replCoord->getTerm());
auto timeout = Milliseconds(1);
- ASSERT_OK(replCoord->waitForMemberState(MemberState::RS_PRIMARY, timeout));
+ ASSERT_OK(replCoord->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_PRIMARY, timeout));
ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit,
- replCoord->waitForMemberState(MemberState::RS_REMOVED, timeout));
+ replCoord->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_REMOVED, timeout));
ASSERT_EQUALS(ErrorCodes::BadValue,
- replCoord->waitForMemberState(MemberState::RS_PRIMARY, Milliseconds(-1)));
+ replCoord->waitForMemberState(Interruptible::notInterruptible(),
+ MemberState::RS_PRIMARY,
+ Milliseconds(-1)));
// Zero timeout is fine.
- ASSERT_OK(replCoord->waitForMemberState(MemberState::RS_PRIMARY, Milliseconds(0)));
+ ASSERT_OK(replCoord->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_PRIMARY, Milliseconds(0)));
ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit,
- replCoord->waitForMemberState(MemberState::RS_ARBITER, Milliseconds(0)));
+ replCoord->waitForMemberState(
+ Interruptible::notInterruptible(), MemberState::RS_ARBITER, Milliseconds(0)));
+
+ // Make sure it can be interrupted.
+ auto opCtx = makeOperationContext();
+ opCtx->markKilled(ErrorCodes::Interrupted);
+ ASSERT_THROWS(
+ replCoord->waitForMemberState(opCtx.get(), MemberState::RS_ARBITER, Milliseconds(0)),
+ ExceptionFor<ErrorCodes::Interrupted>);
}
TEST_F(
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index 788f2991a70..88df8f224ec 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -139,7 +139,8 @@ void ReplicationCoordinatorMock::setCanAcceptNonLocalWrites(bool canAcceptNonLoc
_canAcceptNonLocalWrites = canAcceptNonLocalWrites;
}
-Status ReplicationCoordinatorMock::waitForMemberState(MemberState expectedState,
+Status ReplicationCoordinatorMock::waitForMemberState(Interruptible* interruptible,
+ MemberState expectedState,
Milliseconds timeout) {
MONGO_UNREACHABLE;
return Status::OK();
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index f20e2cadadc..60e48011f1e 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -91,7 +91,9 @@ public:
virtual bool canAcceptNonLocalWrites() const;
- virtual Status waitForMemberState(MemberState expectedState, Milliseconds timeout) override;
+ virtual Status waitForMemberState(Interruptible* interruptible,
+ MemberState expectedState,
+ Milliseconds timeout) override;
virtual bool isInPrimaryOrSecondaryState(OperationContext* opCtx) const;
diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp
index 655a5f48904..8429ac7e568 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.cpp
+++ b/src/mongo/db/repl/replication_coordinator_noop.cpp
@@ -165,7 +165,7 @@ bool ReplicationCoordinatorNoOp::canAcceptNonLocalWrites() const {
MONGO_UNREACHABLE;
}
-Status ReplicationCoordinatorNoOp::waitForMemberState(MemberState, Milliseconds) {
+Status ReplicationCoordinatorNoOp::waitForMemberState(Interruptible*, MemberState, Milliseconds) {
MONGO_UNREACHABLE;
}
diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h
index 7183fdaac4a..74ef742d088 100644
--- a/src/mongo/db/repl/replication_coordinator_noop.h
+++ b/src/mongo/db/repl/replication_coordinator_noop.h
@@ -101,7 +101,7 @@ public:
std::vector<MemberData> getMemberData() const final;
- Status waitForMemberState(MemberState, Milliseconds) final;
+ Status waitForMemberState(Interruptible*, MemberState, Milliseconds) final;
Seconds getSecondaryDelaySecs() const final;
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index 5a117927635..7e574d26f2c 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -170,7 +170,9 @@ bool ReplicationCoordinatorEmbedded::canAcceptNonLocalWrites() const {
UASSERT_NOT_IMPLEMENTED;
}
-Status ReplicationCoordinatorEmbedded::waitForMemberState(MemberState, Milliseconds) {
+Status ReplicationCoordinatorEmbedded::waitForMemberState(Interruptible*,
+ MemberState,
+ Milliseconds) {
UASSERT_NOT_IMPLEMENTED;
}
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index 9a408d645fd..cf2287dc478 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -105,7 +105,7 @@ public:
std::vector<repl::MemberData> getMemberData() const override;
- Status waitForMemberState(repl::MemberState, Milliseconds) override;
+ Status waitForMemberState(Interruptible*, repl::MemberState, Milliseconds) override;
Seconds getSecondaryDelaySecs() const override;