summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorge Wangensteen <george.wangensteen@mongodb.com>2022-09-15 19:14:41 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-15 21:51:59 +0000
commitfc39f3acfa1cdc9e9507267832b4091511656117 (patch)
treec40e83d133fe38793851b01ede845723ad4e72fb
parent2c903273045c274083dbb7a25746e927100ba634 (diff)
downloadmongo-fc39f3acfa1cdc9e9507267832b4091511656117.tar.gz
SERVER-68555 Add RemoteCommandExecutionError to RCR API
-rw-r--r--src/mongo/base/error_codes.yml2
-rw-r--r--src/mongo/db/SConscript1
-rw-r--r--src/mongo/executor/SConscript11
-rw-r--r--src/mongo/executor/hedged_remote_command_runner.h43
-rw-r--r--src/mongo/executor/hedged_remote_command_runner_test.cpp79
-rw-r--r--src/mongo/executor/network_interface_mock.cpp31
-rw-r--r--src/mongo/executor/network_interface_tl.cpp17
-rw-r--r--src/mongo/executor/remote_command_runner.cpp27
-rw-r--r--src/mongo/executor/remote_command_runner_error_info.cpp50
-rw-r--r--src/mongo/executor/remote_command_runner_error_info.h151
-rw-r--r--src/mongo/executor/remote_command_runner_test.cpp67
-rw-r--r--src/mongo/s/SConscript1
-rw-r--r--src/mongo/s/hedge_options_util.h15
-rw-r--r--src/mongo/shell/SConscript1
14 files changed, 416 insertions, 80 deletions
diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml
index 8cc122bfbe1..5d4646901ed 100644
--- a/src/mongo/base/error_codes.yml
+++ b/src/mongo/base/error_codes.yml
@@ -492,6 +492,8 @@ error_codes:
- {code: 380, name: RequestAlreadyFulfilled}
- {code: 381, name: ConflictingServerlessOperation}
+ - {code: 382, name: RemoteCommandExecutionError, extra: RemoteCommandExecutionErrorInfo, categories: [InternalOnly]}
+
# Error codes 4000-8999 are reserved.
# Non-sequential error codes for compatibility only)
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index c0514e4dd44..fe62eceee81 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -2153,6 +2153,7 @@ env.Library(
# NOTE: If you need to add a static or mongo initializer to mongod startup,
# please add that library here, as a private library dependency.
'$BUILD_DIR/mongo/executor/network_interface_factory',
+ '$BUILD_DIR/mongo/executor/remote_command_runner_error_info',
'$BUILD_DIR/mongo/rpc/rpc',
'$BUILD_DIR/mongo/s/commands/sharded_cluster_sharding_commands',
'$BUILD_DIR/mongo/scripting/scripting_server',
diff --git a/src/mongo/executor/SConscript b/src/mongo/executor/SConscript
index 815159efd70..d0f2301cd21 100644
--- a/src/mongo/executor/SConscript
+++ b/src/mongo/executor/SConscript
@@ -246,6 +246,16 @@ env.Library(
)
env.Library(
+ target='remote_command_runner_error_info',
+ source=[
+ 'remote_command_runner_error_info.cpp',
+ ],
+ LIBDEPS=[
+ '$BUILD_DIR/mongo/base',
+ ],
+)
+
+env.Library(
target='remote_command_runner',
source=[
'remote_command_runner.cpp',
@@ -255,6 +265,7 @@ env.Library(
'$BUILD_DIR/mongo/db/service_context_test_fixture',
'$BUILD_DIR/mongo/rpc/command_status',
'remote_command',
+ 'remote_command_runner_error_info',
'task_executor_interface',
],
)
diff --git a/src/mongo/executor/hedged_remote_command_runner.h b/src/mongo/executor/hedged_remote_command_runner.h
index 66f4a51d8d1..8f72312c2c7 100644
--- a/src/mongo/executor/hedged_remote_command_runner.h
+++ b/src/mongo/executor/hedged_remote_command_runner.h
@@ -37,9 +37,11 @@
#include "mongo/db/query/cursor_response.h"
#include "mongo/executor/remote_command_response.h"
#include "mongo/executor/remote_command_runner.h"
+#include "mongo/executor/remote_command_runner_error_info.h"
#include "mongo/executor/remote_command_targeter.h"
#include "mongo/executor/task_executor.h"
#include "mongo/rpc/get_status_from_command_result.h"
+#include "mongo/s/hedge_options_util.h"
#include "mongo/s/mongos_server_parameters_gen.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/cancellation.h"
@@ -155,27 +157,34 @@ SemiFuture<RemoteCommandRunnerResponse<typename CommandType::Reply>> doHedgedReq
* the future with index 0, which we treat as the "authoritative" request. This is the
* codepath followed when we are not hedging or there is only 1 target provided.
*/
- return whenAnyThat(std::move(requests),
- [](StatusWith<SingleResponse> response, size_t index) {
- Status commandStatus = response.getStatus();
- if (index == 0) {
- return true;
- }
-
- if (commandStatus == ErrorCodes::MaxTimeMSExpired ||
- commandStatus == ErrorCodes::StaleDbVersion ||
- ErrorCodes::isStaleShardVersionError(commandStatus)) {
- return false;
- }
- return true;
- });
+ return whenAnyThat(
+ std::move(requests), [](StatusWith<SingleResponse> response, size_t index) {
+ Status commandStatus = response.getStatus();
+
+ if (index == 0) {
+ return true;
+ }
+ if (commandStatus.code() == Status::OK()) {
+ return true;
+ }
+
+ // TODO SERVER-69592 Account for interior executor shutdown
+ invariant(commandStatus.code() == ErrorCodes::RemoteCommandExecutionError);
+ boost::optional<Status> remoteErr;
+ auto extraInfo = commandStatus.extraInfo<RemoteCommandExecutionErrorInfo>();
+ if (extraInfo->isRemote()) {
+ remoteErr = extraInfo->asRemote().getRemoteCommandResult();
+ }
+
+ if (remoteErr && isIgnorableAsHedgeResult(*remoteErr)) {
+ return false;
+ }
+ return true;
+ });
})
.onCompletion([hedgeCancellationToken](StatusWith<SingleResponse> result) mutable {
// TODO SERVER-68101 add retry logic
- // TODO SERVER-68555 add extra error handling info
-
hedgeCancellationToken.cancel();
-
return result;
})
.semi();
diff --git a/src/mongo/executor/hedged_remote_command_runner_test.cpp b/src/mongo/executor/hedged_remote_command_runner_test.cpp
index 3daeded83f6..b879d5a2a02 100644
--- a/src/mongo/executor/hedged_remote_command_runner_test.cpp
+++ b/src/mongo/executor/hedged_remote_command_runner_test.cpp
@@ -141,6 +141,11 @@ TEST_F(HedgedCommandRunnerTest, FindHedgeRequestTwoHosts) {
RemoteCommandRunnerResponse<CursorInitialReply> res = resultFuture.get();
+ auto network = getNetworkInterfaceMock();
+ network->enterNetwork();
+ network->runReadyNetworkOperations();
+ network->exitNetwork();
+
auto counters = getNetworkInterfaceCounters();
ASSERT_EQ(counters.succeeded, 1);
ASSERT_EQ(counters.canceled, 1);
@@ -171,6 +176,11 @@ TEST_F(HedgedCommandRunnerTest, FindHedgeRequestThreeHosts) {
RemoteCommandRunnerResponse<CursorInitialReply> res = resultFuture.get();
+ auto network = getNetworkInterfaceMock();
+ network->enterNetwork();
+ network->runReadyNetworkOperations();
+ network->exitNetwork();
+
auto counters = getNetworkInterfaceCounters();
ASSERT_EQ(counters.succeeded, 1);
ASSERT_EQ(counters.canceled, 2);
@@ -263,11 +273,16 @@ TEST_F(HedgedCommandRunnerTest, FirstCommandFailsWithSignificantError) {
return Status(ErrorCodes::NetworkTimeout, "mock");
});
+ auto network = getNetworkInterfaceMock();
+ network->enterNetwork();
+ network->runReadyNetworkOperations();
+ network->exitNetwork();
+
auto counters = getNetworkInterfaceCounters();
ASSERT_EQ(counters.failed, 1);
ASSERT_EQ(counters.canceled, 1);
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::NetworkTimeout);
+ ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::RemoteCommandExecutionError);
}
/**
@@ -292,19 +307,19 @@ TEST_F(HedgedCommandRunnerTest, BothCommandsFailWithSkippableError) {
onCommand([&](const auto& request) {
ASSERT(request.cmdObj["find"]);
- return Status(ErrorCodes::MaxTimeMSExpired, "mock");
+ return createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock"));
});
onCommand([&](const auto& request) {
ASSERT(request.cmdObj["find"]);
- return Status(ErrorCodes::MaxTimeMSExpired, "mock");
+ return createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock"));
});
auto counters = getNetworkInterfaceCounters();
- ASSERT_EQ(counters.failed, 2);
+ ASSERT_EQ(counters.sent, 2);
ASSERT_EQ(counters.canceled, 0);
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::MaxTimeMSExpired);
+ ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::RemoteCommandExecutionError);
}
TEST_F(HedgedCommandRunnerTest, AllCommandsFailWithSkippableError) {
@@ -324,7 +339,7 @@ TEST_F(HedgedCommandRunnerTest, AllCommandsFailWithSkippableError) {
CancellationToken::uncancelable());
auto network = getNetworkInterfaceMock();
- auto now = getNetworkInterfaceMock()->now();
+ auto now = network->now();
network->enterNetwork();
NetworkInterfaceMock::NetworkOperationIterator noi1 = network->getNextReadyRequest();
@@ -335,25 +350,42 @@ TEST_F(HedgedCommandRunnerTest, AllCommandsFailWithSkippableError) {
auto secondRequest = (*noi2).getRequestOnAny();
auto thirdRequest = (*noi3).getRequestOnAny();
+ auto remoteMaxTimeMSError = RemoteCommandResponse{
+ createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)};
if (firstRequest.target[0] == kThreeHosts[0]) {
- network->scheduleErrorResponse(
- noi1, now + Milliseconds(1000), Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi1,
+ now + Milliseconds(1000),
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
} else {
- network->scheduleErrorResponse(noi1, Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi1,
+ now,
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
}
if (secondRequest.target[0] == kThreeHosts[0]) {
- network->scheduleErrorResponse(
- noi2, now + Milliseconds(1000), Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi2,
+ now + Milliseconds(1000),
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
} else {
- network->scheduleErrorResponse(noi2, Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi2,
+ now,
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
}
if (thirdRequest.target[0] == kThreeHosts[0]) {
- network->scheduleErrorResponse(
- noi3, now + Milliseconds(1000), Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi3,
+ now + Milliseconds(1000),
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
} else {
- network->scheduleErrorResponse(noi3, Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi3,
+ now,
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
}
network->runUntil(now + Milliseconds(1500));
@@ -361,10 +393,10 @@ TEST_F(HedgedCommandRunnerTest, AllCommandsFailWithSkippableError) {
auto counters = network->getCounters();
network->exitNetwork();
- ASSERT_EQ(counters.failed, 3);
+ ASSERT_EQ(counters.succeeded, 3);
ASSERT_EQ(counters.canceled, 0);
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::MaxTimeMSExpired);
+ ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::RemoteCommandExecutionError);
}
/**
@@ -407,7 +439,10 @@ TEST_F(HedgedCommandRunnerTest, FirstCommandFailsWithSkippableErrorNextSucceeds)
if (firstRequest.target[0] == kThreeHosts[0]) {
network->scheduleSuccessfulResponse(noi1, now + Milliseconds(1000), successResponse);
} else {
- network->scheduleErrorResponse(noi1, Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi1,
+ now,
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
}
// if the second request is the authoritative one, send a delayed success response
@@ -415,7 +450,10 @@ TEST_F(HedgedCommandRunnerTest, FirstCommandFailsWithSkippableErrorNextSucceeds)
if (secondRequest.target[0] == kThreeHosts[0]) {
network->scheduleSuccessfulResponse(noi2, now + Milliseconds(1000), successResponse);
} else {
- network->scheduleErrorResponse(noi2, Status(ErrorCodes::MaxTimeMSExpired, "mock"));
+ network->scheduleResponse(
+ noi2,
+ now,
+ {createErrorResponse(Status(ErrorCodes::MaxTimeMSExpired, "mock")), Milliseconds(1)});
}
network->runUntil(now + Milliseconds(1500));
@@ -423,8 +461,7 @@ TEST_F(HedgedCommandRunnerTest, FirstCommandFailsWithSkippableErrorNextSucceeds)
auto counters = network->getCounters();
network->exitNetwork();
- ASSERT_EQ(counters.failed, 1);
- ASSERT_EQ(counters.succeeded, 1);
+ ASSERT_EQ(counters.succeeded, 2);
ASSERT_EQ(counters.canceled, 0);
ASSERT_EQ(resultFuture.get().response.getCursor()->getNs(),
diff --git a/src/mongo/executor/network_interface_mock.cpp b/src/mongo/executor/network_interface_mock.cpp
index 454e7be6fa1..962fb43851d 100644
--- a/src/mongo/executor/network_interface_mock.cpp
+++ b/src/mongo/executor/network_interface_mock.cpp
@@ -160,7 +160,6 @@ void NetworkInterfaceMock::_interruptWithResponse_inlock(const CallbackHandle& c
if (!noi->isFinished()) {
// We've effectively observed the NetworkOperation.
noi->markAsProcessing();
- _counters.canceled++;
_scheduleResponse_inlock(noi, _now_inlock(), response);
}
}
@@ -604,13 +603,6 @@ void NetworkInterfaceMock::_runReadyNetworkOperations_inlock(stdx::unique_lock<s
"request"_attr = noi->getRequest(),
"response"_attr = response.response);
- _counters.sent++;
- if (response.response.status.isOK()) {
- _counters.succeeded++;
- } else {
- _counters.failed++;
- }
-
if (_metadataHook) {
_metadataHook
->readReplyMetadata(noi->getRequest().opCtx,
@@ -619,8 +611,27 @@ void NetworkInterfaceMock::_runReadyNetworkOperations_inlock(stdx::unique_lock<s
.transitional_ignore();
}
- noi->processResponse(std::move(response));
-
+ // The NetworkInterface can recieve multiple responses for a particular request (e.g.
+ // cancellation and a 'true' scheduled response). But each request can only have one logical
+ // response. This choice of the one logical response is mediated by the _isFinished field of
+ // the NetworkOperation; whichever response sets this first via
+ // NetworkOperation::processResponse wins. NetworkOperation::processResponse returns `true`
+ // if the given response was accepted by the NetworkOperation as its sole logical response.
+ //
+ // We care about this here because we only want to increment the counters for operations
+ // succeeded/failed for the responses that are actually used,
+ Status localResponseStatus = response.response.status;
+ bool noiUsedThisResponse = noi->processResponse(std::move(response));
+ if (noiUsedThisResponse) {
+ _counters.sent++;
+ if (localResponseStatus.isOK()) {
+ _counters.succeeded++;
+ } else if (ErrorCodes::isCancellationError(localResponseStatus)) {
+ _counters.canceled++;
+ } else {
+ _counters.failed++;
+ }
+ }
lk->lock();
}
invariant(_currentlyRunning == kNetworkThread);
diff --git a/src/mongo/executor/network_interface_tl.cpp b/src/mongo/executor/network_interface_tl.cpp
index f6979f18462..659278741b1 100644
--- a/src/mongo/executor/network_interface_tl.cpp
+++ b/src/mongo/executor/network_interface_tl.cpp
@@ -41,6 +41,7 @@
#include "mongo/executor/network_interface_tl_gen.h"
#include "mongo/logv2/log.h"
#include "mongo/rpc/get_status_from_command_result.h"
+#include "mongo/s/hedge_options_util.h"
#include "mongo/transport/transport_layer_manager.h"
#include "mongo/util/concurrency/idle_thread_block.h"
#include "mongo/util/net/socket_utils.h"
@@ -102,19 +103,6 @@ bool catchingInvoke(F&& f, EH&& eh, StringData hint) {
}
}
-/**
- * We ignore a subset of errors that may occur while running hedged operations (e.g., maxTimeMS
- * expiration), as the operation may safely succeed despite their failure. For example, a network
- * timeout error indicates the remote host experienced a timeout while running a remote-command as
- * part of executing the hedged operation. This is by no means an indication that the operation has
- * failed, as other hedged operations may still succeed.
- * TODO SERVER-68704 will include other error categories that are safe to ignore.
- */
-bool skipHedgeResult(const Status& status) {
- return status == ErrorCodes::MaxTimeMSExpired || status == ErrorCodes::StaleDbVersion ||
- ErrorCodes::isNetworkTimeoutError(status) || ErrorCodes::isStaleShardVersionError(status);
-}
-
template <typename IA, typename IB, typename F>
int compareTransformed(IA a1, IA a2, IB b1, IB b2, F&& f) {
for (;; ++a1, ++b1)
@@ -125,7 +113,6 @@ int compareTransformed(IA a1, IA a2, IB b1, IB b2, F&& f) {
else if (int r = f(*a1) - f(*b1))
return r;
}
-
} // namespace
/**
@@ -949,7 +936,7 @@ void NetworkInterfaceTL::RequestState::resolve(Future<RemoteCommandResponse> fut
returnConnection(status);
const auto commandStatus = getStatusFromCommandResult(response.data);
- if (isHedge && skipHedgeResult(commandStatus)) {
+ if (isHedge && isIgnorableAsHedgeResult(commandStatus)) {
LOGV2_DEBUG(4660701,
2,
"Hedged request returned status",
diff --git a/src/mongo/executor/remote_command_runner.cpp b/src/mongo/executor/remote_command_runner.cpp
index 7c7c9280014..6b909689094 100644
--- a/src/mongo/executor/remote_command_runner.cpp
+++ b/src/mongo/executor/remote_command_runner.cpp
@@ -30,6 +30,7 @@
#include "mongo/executor/remote_command_runner.h"
#include "mongo/base/error_codes.h"
#include "mongo/executor/remote_command_request.h"
+#include "mongo/executor/remote_command_runner_error_info.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/future.h"
#include "mongo/util/net/hostandport.h"
@@ -38,6 +39,14 @@
namespace mongo::executor::remote_command_runner {
namespace detail {
namespace {
+Status makeErrorIfNeeded(TaskExecutor::ResponseOnAnyStatus r) {
+ if (r.status.isOK() && getStatusFromCommandResult(r.data).isOK() &&
+ getWriteConcernStatusFromCommandResult(r.data).isOK() &&
+ getFirstWriteErrorStatusFromCommandResult(r.data).isOK()) {
+ return Status::OK();
+ }
+ return {RemoteCommandExecutionErrorInfo(r), "Remote command execution failed"};
+}
const auto getRCRImpl = ServiceContext::declareDecoration<std::unique_ptr<RemoteCommandRunner>>();
} // namespace
@@ -65,14 +74,16 @@ public:
targets, dbName.toString(), cmdBSON, rpc::makeEmptyMetadata(), opCtx);
return exec->scheduleRemoteCommandOnAny(executorRequest, token);
})
- .then([&, exec = std::move(exec)](RemoteCommandOnAnyResponse r) {
- // Ensure the command didn't have a local error, or any remote errors, preferring
- // to propogate ok: 0 errors over writeConcern errors over write errors.
- iassert(r.status);
- iassert(getStatusFromCommandResult(r.data));
- iassert(getWriteConcernStatusFromCommandResult(r.data));
- iassert(getFirstWriteErrorStatusFromCommandResult(r.data));
-
+ .onError([](Status s) -> StatusWith<TaskExecutor::ResponseOnAnyStatus> {
+ // If there was a scheduling error or other local error before the
+ // command was accepted by the executor.
+ return Status{RemoteCommandExecutionErrorInfo(s),
+ "Remote command execution failed"};
+ })
+ .then([](TaskExecutor::ResponseOnAnyStatus r) {
+ // TODO SERVER-69592 account for interior executor shutdown
+ auto s = makeErrorIfNeeded(r);
+ uassertStatusOK(s);
return RemoteCommandInternalResponse{r.data, r.target.get()};
});
}
diff --git a/src/mongo/executor/remote_command_runner_error_info.cpp b/src/mongo/executor/remote_command_runner_error_info.cpp
new file mode 100644
index 00000000000..04a34cec8bb
--- /dev/null
+++ b/src/mongo/executor/remote_command_runner_error_info.cpp
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+
+#include "mongo/executor/remote_command_runner_error_info.h"
+#include "mongo/base/init.h"
+#include "mongo/bson/bsonobjbuilder.h"
+
+namespace mongo {
+namespace {
+
+MONGO_INIT_REGISTER_ERROR_EXTRA_INFO(RemoteCommandExecutionErrorInfo);
+
+} // namespace
+
+std::shared_ptr<const ErrorExtraInfo> RemoteCommandExecutionErrorInfo::parse(const BSONObj& obj) {
+ MONGO_UNREACHABLE;
+}
+
+void RemoteCommandExecutionErrorInfo::serialize(BSONObjBuilder* bob) const {
+ MONGO_UNREACHABLE;
+}
+
+} // namespace mongo
diff --git a/src/mongo/executor/remote_command_runner_error_info.h b/src/mongo/executor/remote_command_runner_error_info.h
new file mode 100644
index 00000000000..b51936442cc
--- /dev/null
+++ b/src/mongo/executor/remote_command_runner_error_info.h
@@ -0,0 +1,151 @@
+/**
+ * Copyright (C) 2022-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/error_extra_info.h"
+#include "mongo/bson/bsonobj.h"
+#include "mongo/executor/remote_command_response.h"
+#include "mongo/rpc/get_status_from_command_result.h"
+#include "mongo/stdx/variant.h"
+
+namespace mongo {
+class BSONObjBuilder;
+
+enum class CommandErrorProvenance { kLocal, kRemote };
+
+
+/**
+ * Contains information to augment the 'RemoteCommandExecutionError' error code. In particular, this
+ * class holds the provenance and data of the underlying error(s).
+ */
+class RemoteCommandExecutionErrorInfo final : public ErrorExtraInfo {
+public:
+ /**
+ * Nested class used to describe remote errors.
+ */
+ class RemoteError {
+ public:
+ // The BSON recieved over-the-wire that encodes the remote
+ // error is stored and used to construct this in-memory representation.
+ explicit RemoteError(BSONObj obj)
+ : _error{obj},
+ _remoteCommandResult{getStatusFromCommandResult(_error)},
+ _remoteCommandWriteConcernError{getWriteConcernStatusFromCommandResult(_error)},
+ _remoteCommandFirstWriteError{getFirstWriteErrorStatusFromCommandResult(_error)} {
+ // The buffer backing the default empty BSONObj has static duration so it is effectively
+ // owned.
+ invariant(_error.isOwned() || _error.objdata() == BSONObj().objdata());
+ if (BSONElement errLabelsElem = _error["errorLabels"]; !errLabelsElem.eoo()) {
+ _errLabels = errLabelsElem.Array();
+ }
+ }
+
+ Status getRemoteCommandResult() const {
+ return _remoteCommandResult;
+ }
+
+ Status getRemoteCommandWriteConcernError() const {
+ return _remoteCommandWriteConcernError;
+ }
+
+ Status getRemoteCommandFirstWriteError() const {
+ return _remoteCommandFirstWriteError;
+ }
+
+ BSONObj getResponseObj() const {
+ return _error;
+ }
+
+ std::vector<BSONElement> getErrorLabels() const {
+ return _errLabels;
+ }
+
+ private:
+ const BSONObj _error;
+ Status _remoteCommandResult;
+ Status _remoteCommandWriteConcernError;
+ Status _remoteCommandFirstWriteError;
+ std::vector<BSONElement> _errLabels;
+ };
+
+ // Required member of every ErrorExtraInfo.
+ static constexpr auto code = ErrorCodes::RemoteCommandExecutionError;
+
+ // Unused and marked unreachable - the RemoteCommandExecutionError is InternalOnly and should
+ // never be encoded in a BSONObj / recieved or sent over-the-wire.
+ static std::shared_ptr<const ErrorExtraInfo> parse(const BSONObj& obj);
+
+ /**
+ * Construct the relevant extra info from the RemoteCommandResponse provided by the TaskExecutor
+ * used to invoke the remote command.
+ */
+ explicit RemoteCommandExecutionErrorInfo(executor::RemoteCommandOnAnyResponse rcr)
+ : _error{RemoteError(rcr.data)} {
+ if (!rcr.status.isOK()) {
+ _prov = CommandErrorProvenance::kLocal;
+ _error = rcr.status;
+ return;
+ }
+ _prov = CommandErrorProvenance::kRemote;
+ }
+
+ /**
+ * Construct the relevant extra info from an error status - used if a remote command invokation
+ * attempt fails before it reaches the TaskExecutor level.
+ */
+ explicit RemoteCommandExecutionErrorInfo(Status s)
+ : _prov{CommandErrorProvenance::kLocal}, _error{s} {}
+
+ bool isLocal() const {
+ return _prov == CommandErrorProvenance::kLocal;
+ }
+
+ bool isRemote() const {
+ return _prov == CommandErrorProvenance::kRemote;
+ }
+
+ const RemoteError& asRemote() const {
+ return stdx::get<const RemoteError>(_error);
+ }
+
+ Status asLocal() const {
+ return stdx::get<Status>(_error);
+ }
+
+ // Unused and marked unreachable - the RemoteCommandExecutionError is InternalOnly and should
+ // never be encoded in a BSONObj / recieved or sent over-the-wire.
+ void serialize(BSONObjBuilder* bob) const final;
+
+private:
+ CommandErrorProvenance _prov;
+ stdx::variant<Status, const RemoteError> _error;
+};
+
+} // namespace mongo
diff --git a/src/mongo/executor/remote_command_runner_test.cpp b/src/mongo/executor/remote_command_runner_test.cpp
index 281397ed22b..d4560933e3c 100644
--- a/src/mongo/executor/remote_command_runner_test.cpp
+++ b/src/mongo/executor/remote_command_runner_test.cpp
@@ -27,15 +27,15 @@
* it in the license file.
*/
+#include "mongo/bson/oid.h"
+#include "mongo/db/repl/hello_gen.h"
+#include "mongo/executor/network_test_env.h"
#include "mongo/executor/remote_command_response.h"
#include "mongo/executor/remote_command_retry_policy.h"
#include "mongo/executor/remote_command_runner.h"
+#include "mongo/executor/remote_command_runner_error_info.h"
#include "mongo/executor/remote_command_runner_test_fixture.h"
#include "mongo/executor/remote_command_targeter.h"
-
-#include "mongo/bson/oid.h"
-#include "mongo/db/repl/hello_gen.h"
-#include "mongo/executor/network_test_env.h"
#include "mongo/executor/task_executor.h"
#include "mongo/executor/task_executor_test_fixture.h"
#include "mongo/executor/thread_pool_task_executor.h"
@@ -97,7 +97,17 @@ TEST_F(RemoteCommandRunnerTestFixture, LocalError) {
return Status(ErrorCodes::NetworkTimeout, "mock");
});
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::NetworkTimeout);
+ auto error = resultFuture.getNoThrow().getStatus();
+
+ // The error returned by our API should always be RemoteCommandExecutionError
+ ASSERT_EQ(error.code(), ErrorCodes::RemoteCommandExecutionError);
+ // Make sure we can extract the extra error info
+ auto extraInfo = error.extraInfo<RemoteCommandExecutionErrorInfo>();
+ ASSERT(extraInfo);
+ // Make sure the extra info indicates the error was local, and that the
+ // local error (which is just a Status) has the correct code.
+ ASSERT(extraInfo->isLocal());
+ ASSERT_EQ(extraInfo->asLocal().code(), ErrorCodes::NetworkTimeout);
}
/*
@@ -119,7 +129,19 @@ TEST_F(RemoteCommandRunnerTestFixture, RemoteError) {
return createErrorResponse(Status(ErrorCodes::BadValue, "mock"));
});
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::BadValue);
+ auto error = resultFuture.getNoThrow().getStatus();
+ ASSERT_EQ(error.code(), ErrorCodes::RemoteCommandExecutionError);
+
+ auto extraInfo = error.extraInfo<RemoteCommandExecutionErrorInfo>();
+ ASSERT(extraInfo);
+
+ ASSERT(extraInfo->isRemote());
+ auto remoteError = extraInfo->asRemote();
+ ASSERT_EQ(remoteError.getRemoteCommandResult(), Status(ErrorCodes::BadValue, "mock"));
+
+ // No write concern or write errors expected
+ ASSERT_EQ(remoteError.getRemoteCommandWriteConcernError(), Status::OK());
+ ASSERT_EQ(remoteError.getRemoteCommandFirstWriteError(), Status::OK());
}
/*
@@ -146,7 +168,20 @@ TEST_F(RemoteCommandRunnerTestFixture, WriteConcernError) {
return resWithWriteConcernError;
});
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::WriteConcernFailed);
+ auto error = resultFuture.getNoThrow().getStatus();
+ ASSERT_EQ(error.code(), ErrorCodes::RemoteCommandExecutionError);
+
+ auto extraInfo = error.extraInfo<RemoteCommandExecutionErrorInfo>();
+ ASSERT(extraInfo);
+
+ ASSERT(extraInfo->isRemote());
+ auto remoteError = extraInfo->asRemote();
+ ASSERT_EQ(remoteError.getRemoteCommandWriteConcernError(),
+ Status(ErrorCodes::WriteConcernFailed, "mock"));
+
+ // No top-level command or write errors expected
+ ASSERT_EQ(remoteError.getRemoteCommandFirstWriteError(), Status::OK());
+ ASSERT_EQ(remoteError.getRemoteCommandResult(), Status::OK());
}
/*
@@ -173,8 +208,22 @@ TEST_F(RemoteCommandRunnerTestFixture, WriteError) {
return resWithWriteError;
});
-
- ASSERT_THROWS_CODE(resultFuture.get(), DBException, ErrorCodes::DocumentValidationFailure);
+ auto error = resultFuture.getNoThrow().getStatus();
+ ASSERT_EQ(error.code(), ErrorCodes::RemoteCommandExecutionError);
+
+ auto extraInfo = error.extraInfo<RemoteCommandExecutionErrorInfo>();
+ ASSERT(extraInfo);
+
+ ASSERT(extraInfo->isRemote());
+ auto remoteError = extraInfo->asRemote();
+ ASSERT_EQ(remoteError.getRemoteCommandFirstWriteError(),
+ Status(ErrorCodes::DocumentValidationFailure,
+ "Document failed validation",
+ BSON("errInfo" << writeErrorExtraInfo)));
+
+ // No top-level command or write errors expected
+ ASSERT_EQ(remoteError.getRemoteCommandWriteConcernError(), Status::OK());
+ ASSERT_EQ(remoteError.getRemoteCommandResult(), Status::OK());
}
/*
diff --git a/src/mongo/s/SConscript b/src/mongo/s/SConscript
index f7746d9b4be..17d29525eea 100644
--- a/src/mongo/s/SConscript
+++ b/src/mongo/s/SConscript
@@ -447,6 +447,7 @@ env.Library(
'$BUILD_DIR/mongo/db/stats/counters',
'$BUILD_DIR/mongo/db/windows_options' if env.TargetOSIs('windows') else [],
'$BUILD_DIR/mongo/executor/hedging_metrics',
+ '$BUILD_DIR/mongo/executor/remote_command_runner_error_info',
'$BUILD_DIR/mongo/transport/message_compressor_options_server',
'$BUILD_DIR/mongo/transport/service_entry_point',
'$BUILD_DIR/mongo/transport/transport_layer_manager',
diff --git a/src/mongo/s/hedge_options_util.h b/src/mongo/s/hedge_options_util.h
index 58a14151410..6c2a243d08d 100644
--- a/src/mongo/s/hedge_options_util.h
+++ b/src/mongo/s/hedge_options_util.h
@@ -45,4 +45,19 @@ namespace mongo {
void extractHedgeOptions(const BSONObj& cmdObj,
const ReadPreferenceSetting& readPref,
executor::RemoteCommandRequestOnAny::Options& options);
+
+/**
+ * We ignore a subset of errors that may occur while running hedged operations (e.g., maxTimeMS
+ * expiration), as the operation may safely succeed despite their failure. For example, a network
+ * timeout error indicates the remote host experienced a timeout while running a remote-command as
+ * part of executing the hedged operation. This is by no means an indication that the operation has
+ * failed, as other hedged operations may still succeed. This function returns 'true' if we should
+ * ignore this error as a response to a hedged operation, and allow other hedges of the operation to
+ * possibly succeed.
+ * TODO SERVER-68704 will include other error categories that are safe to ignore.
+ */
+inline bool isIgnorableAsHedgeResult(const Status& status) {
+ return status == ErrorCodes::MaxTimeMSExpired || status == ErrorCodes::StaleDbVersion ||
+ ErrorCodes::isNetworkTimeoutError(status) || ErrorCodes::isStaleShardVersionError(status);
+}
} // namespace mongo
diff --git a/src/mongo/shell/SConscript b/src/mongo/shell/SConscript
index 54169474a69..c54c5a93524 100644
--- a/src/mongo/shell/SConscript
+++ b/src/mongo/shell/SConscript
@@ -266,6 +266,7 @@ if not has_option('noshell') and jsEngine:
"$BUILD_DIR/mongo/db/views/resolved_view",
"$BUILD_DIR/mongo/executor/network_interface_factory",
"$BUILD_DIR/mongo/executor/network_interface_thread_pool",
+ "$BUILD_DIR/mongo/executor/remote_command_runner_error_info",
"$BUILD_DIR/mongo/executor/thread_pool_task_executor",
"$BUILD_DIR/mongo/rpc/message",
"$BUILD_DIR/mongo/scripting/scripting",