summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAntonio Fuschetto <antonio.fuschetto@mongodb.com>2023-03-09 20:08:08 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-03-09 20:52:59 +0000
commitb01ba75c76f72bc82167662fdf3c55eb24085acb (patch)
treedc1e9d2bee0b4f0a45d9b5acf57a64f323f68a16 /src
parent52c4ac5618a6325a69cf68ab498ff3f5b0590b90 (diff)
downloadmongo-b01ba75c76f72bc82167662fdf3c55eb24085acb.tar.gz
SERVER-74185 Support for cleanup in Recoverable Sharding DDL Coordinator
Diffstat (limited to 'src')
-rw-r--r--src/mongo/base/error_codes.yml5
-rw-r--r--src/mongo/db/s/sharding_ddl_coordinator.cpp26
-rw-r--r--src/mongo/db/s/sharding_ddl_coordinator.h38
-rw-r--r--src/mongo/db/s/sharding_ddl_coordinator.idl13
-rw-r--r--src/mongo/db/s/sharding_ddl_util.cpp40
-rw-r--r--src/mongo/db/s/sharding_ddl_util.h8
-rw-r--r--src/mongo/db/s/sharding_ddl_util_test.cpp92
7 files changed, 218 insertions, 4 deletions
diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml
index d40868eee0a..15baffdae6a 100644
--- a/src/mongo/base/error_codes.yml
+++ b/src/mongo/base/error_codes.yml
@@ -506,7 +506,7 @@ error_codes:
- {code: 386, name: DuplicateKeyId}
- {code: 387, name: EncounteredFLEPayloadWhileRedacting}
-
+
- {code: 388, name: TransactionTooLargeForCache}
- {code: 389, name: LibmongocryptError}
@@ -516,6 +516,9 @@ error_codes:
- {code: 393, name: InvalidTenantId}
- {code: 394, name: MovePrimaryRecipientDocNotFound, categories: [InternalOnly]}
+
+ - {code: 395, name: TruncatedSerialization}
+
# Error codes 4000-8999 are reserved.
# Non-sequential error codes (for compatibility only)
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.cpp b/src/mongo/db/s/sharding_ddl_coordinator.cpp
index 62b1e4eef44..5306b9c1415 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.cpp
+++ b/src/mongo/db/s/sharding_ddl_coordinator.cpp
@@ -226,6 +226,17 @@ ExecutorFuture<void> ShardingDDLCoordinator::_acquireLockAsync(
.on(**executor, token);
}
+ExecutorFuture<void> ShardingDDLCoordinator::_cleanupOnAbort(
+ std::shared_ptr<executor::ScopedTaskExecutor> executor,
+ const CancellationToken& token,
+ const Status& status) noexcept {
+ return ExecutorFuture<void>(**executor);
+}
+
+boost::optional<Status> ShardingDDLCoordinator::getAbortReason() const {
+ return boost::none;
+}
+
void ShardingDDLCoordinator::interrupt(Status status) {
LOGV2_DEBUG(5390535,
1,
@@ -348,17 +359,25 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
return status;
})
.then([this, executor, token, anchor = shared_from_this()] {
- return AsyncTry([this, executor, token] { return _runImpl(executor, token); })
+ return AsyncTry([this, executor, token] {
+ if (const auto& status = getAbortReason()) {
+ return _cleanupOnAbort(executor, token, *status);
+ }
+
+ return _runImpl(executor, token);
+ })
.until([this, token](Status status) {
// Retry until either:
// - The coordinator succeeds
// - The coordinator failed with non-retryable error determined by the
// coordinator, or an already known retryable error
+ // - Cleanup is not planned
//
// If the token is not cancelled we retry because it could have been generated
// by a remote node.
if (!status.isOK() && !_completeOnError &&
- (_mustAlwaysMakeProgress() || _isRetriableErrorForDDLCoordinator(status)) &&
+ (getAbortReason() || _mustAlwaysMakeProgress() ||
+ _isRetriableErrorForDDLCoordinator(status)) &&
!token.isCanceled()) {
LOGV2_INFO(5656000,
"Re-executing sharding DDL coordinator",
@@ -383,7 +402,8 @@ SemiFuture<void> ShardingDDLCoordinator::run(std::shared_ptr<executor::ScopedTas
status.isA<ErrorCategory::ShutdownError>()) ||
token.isCanceled() || _completeOnError);
- auto completionStatus = status;
+ auto completionStatus =
+ !status.isOK() ? status : getAbortReason().get_value_or(Status::OK());
bool isSteppingDown = token.isCanceled();
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.h b/src/mongo/db/s/sharding_ddl_coordinator.h
index 6cbc4cbd099..7b53a15e6f7 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.h
+++ b/src/mongo/db/s/sharding_ddl_coordinator.h
@@ -161,6 +161,11 @@ private:
virtual ExecutorFuture<void> _runImpl(std::shared_ptr<executor::ScopedTaskExecutor> executor,
const CancellationToken& token) noexcept = 0;
+ virtual ExecutorFuture<void> _cleanupOnAbort(
+ std::shared_ptr<executor::ScopedTaskExecutor> executor,
+ const CancellationToken& token,
+ const Status& status) noexcept;
+
void interrupt(Status status) override final;
bool _removeDocument(OperationContext* opCtx);
@@ -174,6 +179,8 @@ private:
ExecutorFuture<void> _translateTimeseriesNss(
std::shared_ptr<executor::ScopedTaskExecutor> executor, const CancellationToken& token);
+ virtual boost::optional<Status> getAbortReason() const;
+
Mutex _mutex = MONGO_MAKE_LATCH("ShardingDDLCoordinator::_mutex");
SharedPromise<void> _constructionCompletionPromise;
SharedPromise<void> _completionPromise;
@@ -401,6 +408,37 @@ protected:
osi.setTxnNumber(optSession->getTxnNumber());
return osi;
}
+
+ virtual boost::optional<Status> getAbortReason() const override {
+ const auto& status = _doc.getAbortReason();
+ invariant(!status || !status->isOK(), "when persisted, status must be an error");
+ return status;
+ }
+
+ /**
+ * Persists the abort reason and throws it as an exception. This causes the coordinator to fail,
+ * and triggers the cleanup future chain since there is a persisted reason.
+ */
+ void triggerCleanup(OperationContext* opCtx, const Status& status) {
+ LOGV2_INFO(7418502,
+ "Coordinator failed, persisting abort reason",
+ "coordinatorId"_attr = _doc.getId(),
+ "phase"_attr = serializePhase(_doc.getPhase()),
+ "reason"_attr = redact(status));
+
+ auto newDoc = [&] {
+ stdx::lock_guard lk{_docMutex};
+ return _doc;
+ }();
+
+ auto coordinatorMetadata = newDoc.getShardingDDLCoordinatorMetadata();
+ coordinatorMetadata.setAbortReason(status);
+ newDoc.setShardingDDLCoordinatorMetadata(std::move(coordinatorMetadata));
+
+ _updateStateDocument(opCtx, std::move(newDoc));
+
+ uassertStatusOK(status);
+ }
};
#undef MONGO_LOGV2_DEFAULT_COMPONENT
diff --git a/src/mongo/db/s/sharding_ddl_coordinator.idl b/src/mongo/db/s/sharding_ddl_coordinator.idl
index 26d5702ec1e..e8336c5c0dc 100644
--- a/src/mongo/db/s/sharding_ddl_coordinator.idl
+++ b/src/mongo/db/s/sharding_ddl_coordinator.idl
@@ -32,6 +32,7 @@ global:
cpp_namespace: "mongo"
cpp_includes:
- "mongo/db/s/forwardable_operation_metadata.h"
+ - "mongo/db/s/sharding_ddl_util.h"
- "mongo/s/database_version.h"
imports:
@@ -75,6 +76,7 @@ types:
bson_serialization_type: object
serializer: "mongo::ForwardableOperationMetadata::toBSON"
deserializer: "mongo::ForwardableOperationMetadata"
+
DatabaseVersion:
description: ""
cpp_type: DatabaseVersion
@@ -82,6 +84,13 @@ types:
serializer: "mongo::DatabaseVersion::toBSON"
deserializer: "mongo::DatabaseVersion"
+ ErrorStatus:
+ bson_serialization_type: any
+ description: "Status representing an error state"
+ cpp_type: "::mongo::Status"
+ serializer: "::mongo::sharding_ddl_util_serializeErrorStatusToBSON"
+ deserializer: "::mongo::sharding_ddl_util_deserializeErrorStatusFromBSON"
+
structs:
ShardingDDLCoordinatorId:
description: "Identifier for a specific sharding DDL Coordinator."
@@ -129,3 +138,7 @@ structs:
description: "In case of sharded timeseries this variable will hold the underlying bucket namespace"
type: namespacestring
optional: true
+ abortReason:
+ description: "Error triggering the cleanup pipeline"
+ type: ErrorStatus
+ optional: true
diff --git a/src/mongo/db/s/sharding_ddl_util.cpp b/src/mongo/db/s/sharding_ddl_util.cpp
index deca970d22b..6bb773a1b7c 100644
--- a/src/mongo/db/s/sharding_ddl_util.cpp
+++ b/src/mongo/db/s/sharding_ddl_util.cpp
@@ -30,6 +30,7 @@
#include "mongo/db/s/sharding_ddl_util.h"
+#include "mongo/bson/util/bson_extract.h"
#include "mongo/db/catalog/collection_catalog.h"
#include "mongo/db/cluster_transaction_api.h"
#include "mongo/db/commands/feature_compatibility_version.h"
@@ -66,6 +67,44 @@
namespace mongo {
+
+static const size_t kSerializedErrorStatusMaxSize = 1024 * 2;
+
+void sharding_ddl_util_serializeErrorStatusToBSON(const Status& status,
+ StringData fieldName,
+ BSONObjBuilder* bsonBuilder) {
+ uassert(7418500, "Status must be an error", !status.isOK());
+
+ BSONObjBuilder tmpBuilder;
+ status.serialize(&tmpBuilder);
+
+ if (status != ErrorCodes::TruncatedSerialization &&
+ (size_t)tmpBuilder.asTempObj().objsize() > kSerializedErrorStatusMaxSize) {
+ const auto statusStr = status.toString();
+ const auto truncatedStatusStr =
+ str::UTF8SafeTruncation(statusStr, kSerializedErrorStatusMaxSize);
+ const Status truncatedStatus{ErrorCodes::TruncatedSerialization, truncatedStatusStr};
+
+ tmpBuilder.resetToEmpty();
+ truncatedStatus.serializeErrorToBSON(&tmpBuilder);
+ }
+
+ bsonBuilder->append(fieldName, tmpBuilder.obj());
+}
+
+Status sharding_ddl_util_deserializeErrorStatusFromBSON(const BSONElement& bsonElem) {
+ const auto& bsonObj = bsonElem.Obj();
+
+ long long code;
+ uassertStatusOK(bsonExtractIntegerField(bsonObj, "code", &code));
+ uassert(7418501, "Status must be an error", code != ErrorCodes::OK);
+
+ std::string errmsg;
+ uassertStatusOK(bsonExtractStringField(bsonObj, "errmsg", &errmsg));
+
+ return {ErrorCodes::Error(code), errmsg, bsonObj};
+}
+
namespace sharding_ddl_util {
namespace {
@@ -872,5 +911,6 @@ BSONObj getCriticalSectionReasonForRename(const NamespaceString& from, const Nam
<< "rename"
<< "from" << from.toString() << "to" << to.toString());
}
+
} // namespace sharding_ddl_util
} // namespace mongo
diff --git a/src/mongo/db/s/sharding_ddl_util.h b/src/mongo/db/s/sharding_ddl_util.h
index cc800b04632..988d2bb571a 100644
--- a/src/mongo/db/s/sharding_ddl_util.h
+++ b/src/mongo/db/s/sharding_ddl_util.h
@@ -40,6 +40,14 @@
#include "mongo/s/request_types/sharded_ddl_commands_gen.h"
namespace mongo {
+
+// TODO (SERVER-74481): Define these functions in the nested `sharding_ddl_util` namespace when the
+// IDL compiler will support the use case.
+void sharding_ddl_util_serializeErrorStatusToBSON(const Status& status,
+ StringData fieldName,
+ BSONObjBuilder* bsonBuilder);
+Status sharding_ddl_util_deserializeErrorStatusFromBSON(const BSONElement& bsonElem);
+
namespace sharding_ddl_util {
/**
diff --git a/src/mongo/db/s/sharding_ddl_util_test.cpp b/src/mongo/db/s/sharding_ddl_util_test.cpp
index 7b373dfb284..ee4c94640e6 100644
--- a/src/mongo/db/s/sharding_ddl_util_test.cpp
+++ b/src/mongo/db/s/sharding_ddl_util_test.cpp
@@ -103,6 +103,98 @@ void findN(DBClientBase& client,
}
}
+TEST_F(ShardingDDLUtilTest, SerializeDeserializeErrorStatusWithoutExtraInfo) {
+ const Status sample{ErrorCodes::ForTestingOptionalErrorExtraInfo, "Dummy reason"};
+
+ BSONObjBuilder bsonBuilder;
+ sharding_ddl_util_serializeErrorStatusToBSON(sample, "status", &bsonBuilder);
+ const auto serialized = bsonBuilder.done();
+
+ const auto deserialized =
+ sharding_ddl_util_deserializeErrorStatusFromBSON(serialized.firstElement());
+
+ ASSERT_EQ(sample.code(), deserialized.code());
+ ASSERT_EQ(sample.reason(), deserialized.reason());
+ ASSERT(!deserialized.extraInfo());
+}
+
+TEST_F(ShardingDDLUtilTest, SerializeDeserializeErrorStatusWithExtraInfo) {
+ OptionalErrorExtraInfoExample::EnableParserForTest whenInScope;
+
+ const Status sample{
+ ErrorCodes::ForTestingOptionalErrorExtraInfo, "Dummy reason", fromjson("{data: 123}")};
+
+ BSONObjBuilder bsonBuilder;
+ sharding_ddl_util_serializeErrorStatusToBSON(sample, "status", &bsonBuilder);
+ const auto serialized = bsonBuilder.done();
+
+ const auto deserialized =
+ sharding_ddl_util_deserializeErrorStatusFromBSON(serialized.firstElement());
+
+ ASSERT_EQ(sample.code(), deserialized.code());
+ ASSERT_EQ(sample.reason(), deserialized.reason());
+ ASSERT(deserialized.extraInfo());
+ ASSERT(deserialized.extraInfo<OptionalErrorExtraInfoExample>());
+ ASSERT_EQ(deserialized.extraInfo<OptionalErrorExtraInfoExample>()->data, 123);
+}
+
+TEST_F(ShardingDDLUtilTest, SerializeDeserializeErrorStatusInvalid) {
+ BSONObjBuilder bsonBuilder;
+ ASSERT_THROWS_CODE(
+ sharding_ddl_util_serializeErrorStatusToBSON(Status::OK(), "status", &bsonBuilder),
+ DBException,
+ 7418500);
+
+ const auto okStatusBSON =
+ BSON("status" << BSON("code" << ErrorCodes::OK << "codeName"
+ << ErrorCodes::errorString(ErrorCodes::OK)));
+ ASSERT_THROWS_CODE(
+ sharding_ddl_util_deserializeErrorStatusFromBSON(okStatusBSON.firstElement()),
+ DBException,
+ 7418501);
+}
+
+TEST_F(ShardingDDLUtilTest, SerializeErrorStatusTooBig) {
+ const std::string longReason(1024 * 3, 'x');
+ const Status sample{ErrorCodes::ForTestingOptionalErrorExtraInfo, longReason};
+
+ BSONObjBuilder bsonBuilder;
+ sharding_ddl_util_serializeErrorStatusToBSON(sample, "status", &bsonBuilder);
+ const auto serialized = bsonBuilder.done();
+
+ const auto deserialized =
+ sharding_ddl_util_deserializeErrorStatusFromBSON(serialized.firstElement());
+
+ ASSERT_EQ(ErrorCodes::TruncatedSerialization, deserialized.code());
+ ASSERT_EQ(
+ "ForTestingOptionalErrorExtraInfo: "
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
+ deserialized.reason());
+ ASSERT(!deserialized.extraInfo());
+}
+
// Test that config.collection document and config.chunks documents are properly updated from source
// to destination collection metadata
TEST_F(ShardingDDLUtilTest, ShardedRenameMetadata) {