summary refs log tree commit diff
diff options
context:
space:
mode:
author Matt Broadstone <mbroadst@mongodb.com> 2022-08-31 13:30:01 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-31 14:00:31 +0000
commit 54746da1485cfca77a1f1fd214ee83d3d6948d0d (patch)
tree 338605f0da1493f35464f9ad34ef91bce964666a
parent a4bd3ce3607d2c3020d7efa3501240ae4b1a1b03 (diff)
download mongo-54746da1485cfca77a1f1fd214ee83d3d6948d0d.tar.gz
SERVER-69262 Unconditionally retry stepup and no-op write commands
-rw-r--r-- jstests/serverless/shard_split_abort_while_committing.js 5
-rw-r--r-- src/mongo/db/serverless/shard_split_donor_service.cpp 37
2 files changed, 12 insertions, 30 deletions
diff --git a/jstests/serverless/shard_split_abort_while_committing.js b/jstests/serverless/shard_split_abort_while_committing.js
index a72e0e78144..e3688236657 100644
--- a/jstests/serverless/shard_split_abort_while_committing.js
+++ b/jstests/serverless/shard_split_abort_while_committing.js
@@ -7,10 +7,7 @@
load("jstests/libs/fail_point_util.js");
load("jstests/serverless/libs/basic_serverless_test.js");
-const failpoints = [
- "pauseShardSplitBeforeSendingStepUpToRecipients",
- "pauseShardSplitAfterUpdatingToCommittedState"
-];
+const failpoints = ["pauseShardSplitAfterUpdatingToCommittedState"];
function testAbortAfterSplitIsAppliedStillsCommits(failpoint) {
"use strict";
diff --git a/src/mongo/db/serverless/shard_split_donor_service.cpp b/src/mongo/db/serverless/shard_split_donor_service.cpp
index 5d59e965d3e..26e17a3e2b5 100644
--- a/src/mongo/db/serverless/shard_split_donor_service.cpp
+++ b/src/mongo/db/serverless/shard_split_donor_service.cpp
@@ -71,7 +71,6 @@ MONGO_FAIL_POINT_DEFINE(pauseShardSplitBeforeSplitConfigRemoval);
MONGO_FAIL_POINT_DEFINE(skipShardSplitRecipientCleanup);
MONGO_FAIL_POINT_DEFINE(pauseShardSplitBeforeLeavingBlockingState);
MONGO_FAIL_POINT_DEFINE(pauseShardSplitAfterUpdatingToCommittedState);
-MONGO_FAIL_POINT_DEFINE(pauseShardSplitBeforeSendingStepUpToRecipients);
MONGO_FAIL_POINT_DEFINE(pauseShardSplitAfterReceivingAbortCmd);
const Backoff kExponentialBackoff(Seconds(1), Milliseconds::max());
@@ -753,8 +752,7 @@ ExecutorFuture<void> ShardSplitDonorService::DonorStateMachine::_applySplitConfi
ExecutorFuture<void> remoteAdminCommand(TaskExecutorPtr executor,
const CancellationToken& token,
const HostAndPort remoteNode,
- const BSONObj& command,
- std::function<bool(Status)> untilCondition = nullptr) {
+ const BSONObj& command) {
return AsyncTry([executor, token, remoteNode, command] {
executor::RemoteCommandRequest request(remoteNode, "admin", command, nullptr);
auto hasWriteConcern = command.hasField(WriteConcernOptions::kWriteConcernField);
@@ -769,27 +767,15 @@ ExecutorFuture<void> remoteAdminCommand(TaskExecutorPtr executor,
return status;
});
})
- .until([untilCondition](Status status) {
- if (untilCondition) {
- return untilCondition(status);
- }
-
- return status.isOK() ||
- (!ErrorCodes::isRetriableError(status) &&
- !ErrorCodes::isNetworkTimeoutError(status));
- })
- .withBackoffBetweenIterations(kExponentialBackoff)
+ .until([](Status status) { return status.isOK(); })
.on(executor, token);
}
ExecutorFuture<void> sendStepUpToRecipient(TaskExecutorPtr executor,
const CancellationToken& token,
const HostAndPort recipientPrimary) {
- return remoteAdminCommand(executor,
- token,
- recipientPrimary,
- BSON("replSetStepUp" << 1 << "skipDryRun" << true),
- [](Status status) { return status.isOK(); });
+ return remoteAdminCommand(
+ executor, token, recipientPrimary, BSON("replSetStepUp" << 1 << "skipDryRun" << true));
}
ExecutorFuture<void> waitForMajorityWriteOnRecipient(TaskExecutorPtr executor,
@@ -823,10 +809,7 @@ ShardSplitDonorService::DonorStateMachine::_waitForSplitAcceptanceAndEnterCommit
return ExecutorFuture(**executor)
.then([&]() { return _splitAcceptancePromise.getFuture(); })
- .then([this, executor, primaryToken](const HostAndPort& recipientPrimary) {
- // only cancel operations on stepdown from here out
- _cancelableOpCtxFactory.emplace(primaryToken, _markKilledExecutor);
-
+ .then([this, executor, abortToken](const HostAndPort& recipientPrimary) {
auto opCtx = _cancelableOpCtxFactory->makeOperationContext(&cc());
if (MONGO_unlikely(pauseShardSplitBeforeLeavingBlockingState.shouldFail())) {
pauseShardSplitBeforeLeavingBlockingState.execute([&](const BSONObj& data) {
@@ -861,19 +844,21 @@ ShardSplitDonorService::DonorStateMachine::_waitForSplitAcceptanceAndEnterCommit
? *_splitAcceptanceTaskExecutorForTest
: **executor;
- pauseShardSplitBeforeSendingStepUpToRecipients.pauseWhileSet();
- return sendStepUpToRecipient(remoteCommandExecutor, primaryToken, recipientPrimary)
- .then([this, remoteCommandExecutor, primaryToken, recipientPrimary]() {
+ return sendStepUpToRecipient(remoteCommandExecutor, abortToken, recipientPrimary)
+ .then([this, remoteCommandExecutor, abortToken, recipientPrimary]() {
LOGV2(8423365,
"Waiting for majority commit on recipient primary",
"id"_attr = _migrationId);
return waitForMajorityWriteOnRecipient(
- remoteCommandExecutor, primaryToken, recipientPrimary);
+ remoteCommandExecutor, abortToken, recipientPrimary);
});
})
.thenRunOn(**executor)
.then([this, executor, primaryToken]() {
+ // only cancel operations on stepdown from here out
+ _cancelableOpCtxFactory.emplace(primaryToken, _markKilledExecutor);
+
LOGV2(6142503, "Entering 'committed' state.", "id"_attr = _stateDoc.getId());
auto opCtx = _cancelableOpCtxFactory->makeOperationContext(&cc());
pauseShardSplitAfterUpdatingToCommittedState.pauseWhileSet(opCtx.get());