diff options
author | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2019-07-10 18:37:46 +0200 |
---|---|---|
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2019-07-14 08:14:07 -0400 |
commit | 903207938dc05f9e3f4ca546232d8a7ceda99e4c (patch) | |
tree | 1afc0550abe5a044e74e3fce1e4a48a4b98b286a /src | |
parent | e6644474d876eb99579101e81d38c363feef07cd (diff) | |
download | mongo-903207938dc05f9e3f4ca546232d8a7ceda99e4c.tar.gz |
SERVER-41204 Output the transaction abort reason in the slow log line
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/base/status.cpp | 11 | ||||
-rw-r--r-- | src/mongo/base/status.h | 6 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator.h | 1 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_document.idl | 12 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_structures.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_structures.h | 9 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_structures_test.cpp | 28 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_test.cpp | 69 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_util.cpp | 119 | ||||
-rw-r--r-- | src/mongo/db/s/transaction_coordinator_util.h | 7 |
11 files changed, 204 insertions, 76 deletions
diff --git a/src/mongo/base/status.cpp b/src/mongo/base/status.cpp index 696ff79e5e0..ae4235c0bc3 100644 --- a/src/mongo/base/status.cpp +++ b/src/mongo/base/status.cpp @@ -143,4 +143,15 @@ std::string Status::toString() const { return sb.str(); } +void Status::serializeErrorToBSON(BSONObjBuilder* builder) const { + invariant(!isOK()); + + builder->append("code", code()); + builder->append("codeName", ErrorCodes::errorString(code())); + builder->append("errmsg", reason()); + + if (auto ei = extraInfo()) + ei->serialize(builder); +} + } // namespace mongo diff --git a/src/mongo/base/status.h b/src/mongo/base/status.h index d3faa8fd0ec..c31ef5e83bb 100644 --- a/src/mongo/base/status.h +++ b/src/mongo/base/status.h @@ -194,6 +194,12 @@ public: std::string toString() const; /** + * May only be called if the status is *not OK*. Serializes the code, code name and reason in + * the canonical code/codeName/errmsg format used in the server command responses. + */ + void serializeErrorToBSON(BSONObjBuilder* builder) const; + + /** * Returns true if this Status's code is a member of the given category. */ template <ErrorCategory category> diff --git a/src/mongo/db/s/transaction_coordinator.cpp b/src/mongo/db/s/transaction_coordinator.cpp index e189f75cd33..e9c59b0b17b 100644 --- a/src/mongo/db/s/transaction_coordinator.cpp +++ b/src/mongo/db/s/transaction_coordinator.cpp @@ -187,11 +187,7 @@ TransactionCoordinator::TransactionCoordinator(ServiceContext* serviceContext, return Future<void>::makeReady(); } - return txn::persistDecision(*_scheduler, - _lsid, - _txnNumber, - *_participants, - _decision->getCommitTimestamp()) + return txn::persistDecision(*_scheduler, _lsid, _txnNumber, *_participants, *_decision) .then([this] { stdx::lock_guard<stdx::mutex> lg(_mutex); _decisionDurable = true; @@ -264,6 +260,7 @@ TransactionCoordinator::TransactionCoordinator(ServiceContext* serviceContext, } TransactionCoordinator::~TransactionCoordinator() { + invariant(_completionPromisesFired); invariant(_completionPromises.empty()); } @@ -399,7 +396,7 @@ std::string TransactionCoordinator::_twoPhaseCommitInfoForLog( break; case txn::CommitDecision::kAbort: s << ", terminationCause:aborted"; - // TODO: abortCause, abortSource + s << ", terminationDetails: " << *decision.getAbortStatus(); break; default: MONGO_UNREACHABLE; diff --git a/src/mongo/db/s/transaction_coordinator.h b/src/mongo/db/s/transaction_coordinator.h index a482628a897..2bfb54e74ba 100644 --- a/src/mongo/db/s/transaction_coordinator.h +++ b/src/mongo/db/s/transaction_coordinator.h @@ -160,6 +160,7 @@ private: // Protects the state below mutable stdx::mutex _mutex; + // Tracks which step of the 2PC coordination is currently (or was most recently) executing Step _step{Step::kInactive}; // Promise/future pair which will be signaled when the coordinator has completed diff --git a/src/mongo/db/s/transaction_coordinator_document.idl b/src/mongo/db/s/transaction_coordinator_document.idl index 78dadac283b..39f1bd0bef3 100644 --- a/src/mongo/db/s/transaction_coordinator_document.idl +++ b/src/mongo/db/s/transaction_coordinator_document.idl @@ -45,6 +45,13 @@ types: serializer: "::mongo::txn::writeCommitDecisionEnumProperty" deserializer: "::mongo::txn::readCommitDecisionEnumProperty" + status: + bson_serialization_type: any + description: "Serializes/deserializes the general server Status type" + cpp_type: "::mongo::Status" + serializer: "::mongo::txn::writeStatusProperty" + deserializer: "::mongo::txn::readStatusProperty" + structs: CoordinatorCommitDecision: description: "An object representing the coordinator's commit decision." @@ -59,6 +66,11 @@ structs: type: timestamp description: "If the decision is 'commit', contains the chosen commit timestamp, otherwise it will not be set" + abortStatus: + optional: true + type: status + description: "If the decision is 'abort', contains the reason the shard aborted, + otherwise it will not be set" TransactionCoordinatorDocument: description: "A document used for majority confirming the coordinator's state changes" diff --git a/src/mongo/db/s/transaction_coordinator_structures.cpp b/src/mongo/db/s/transaction_coordinator_structures.cpp index 2088ff6d5c7..65e9a840f99 100644 --- a/src/mongo/db/s/transaction_coordinator_structures.cpp +++ b/src/mongo/db/s/transaction_coordinator_structures.cpp @@ -75,5 +75,14 @@ logger::LogstreamBuilder& operator<<(logger::LogstreamBuilder& stream, MONGO_UNREACHABLE; } +Status readStatusProperty(const BSONElement& statusBSON) { + return Status(ErrorCodes::Error(statusBSON["code"].Int()), statusBSON["errmsg"].String()); +} + +void writeStatusProperty(const Status& status, StringData fieldName, BSONObjBuilder* builder) { + BSONObjBuilder statusBuilder(builder->subobjStart(fieldName)); + status.serializeErrorToBSON(&statusBuilder); +} + } // namespace txn } // namespace mongo diff --git a/src/mongo/db/s/transaction_coordinator_structures.h b/src/mongo/db/s/transaction_coordinator_structures.h index 76f7890990d..13462d2a273 100644 --- a/src/mongo/db/s/transaction_coordinator_structures.h +++ b/src/mongo/db/s/transaction_coordinator_structures.h @@ -33,6 +33,7 @@ #include <vector> #include "mongo/bson/bsonobj.h" +#include "mongo/bson/bsonobjbuilder.h" #include "mongo/logger/logstream_builder.h" #include "mongo/s/shard_id.h" @@ -49,7 +50,7 @@ enum class PrepareVote { using CommitDecision = PrepareVote; /** - * String serializer/deserializer for the commit decision property values. + * String serializer/deserializer for the commit decision enum values. */ CommitDecision readCommitDecisionEnumProperty(StringData decision); StringData writeCommitDecisionEnumProperty(CommitDecision decision); @@ -57,5 +58,11 @@ StringData writeCommitDecisionEnumProperty(CommitDecision decision); logger::LogstreamBuilder& operator<<(logger::LogstreamBuilder& stream, const CommitDecision& decision); +/** + * Optional serializer/deserializer for the generic server 'Status' type. + */ +Status readStatusProperty(const BSONElement& statusBSON); +void writeStatusProperty(const Status& status, StringData fieldName, BSONObjBuilder* builder); + } // namespace txn } // namespace mongo diff --git a/src/mongo/db/s/transaction_coordinator_structures_test.cpp b/src/mongo/db/s/transaction_coordinator_structures_test.cpp index 6ccc636a222..f29b442559b 100644 --- a/src/mongo/db/s/transaction_coordinator_structures_test.cpp +++ b/src/mongo/db/s/transaction_coordinator_structures_test.cpp @@ -36,15 +36,39 @@ namespace mongo { namespace txn { namespace { -TEST(CoordinatorCommitDecisionTest, SerializeNoCommitTimestamp) { +TEST(CoordinatorCommitDecisionTest, SerializeCommitHasTimestampAndNoAbortStatus) { CoordinatorCommitDecision decision(CommitDecision::kCommit); + decision.setCommitTimestamp(Timestamp(100, 200)); + auto obj = decision.toBSON(); ASSERT_BSONOBJ_EQ(BSON("decision" - << "commit"), + << "commit" + << "commitTimestamp" + << Timestamp(100, 200)), obj); } +TEST(CoordinatorCommitDecisionTest, SerializeAbortHasNoTimestampAndAbortStatus) { + CoordinatorCommitDecision decision(CommitDecision::kAbort); + decision.setAbortStatus(Status(ErrorCodes::InternalError, "Test error")); + + auto obj = decision.toBSON(); + auto expectedObj = BSON("decision" + << "abort" + << "abortStatus" + << BSON("code" << 1 << "codeName" + << "InternalError" + << "errmsg" + << "Test error")); + + ASSERT_BSONOBJ_EQ(expectedObj, obj); + + auto deserializedDecision = + CoordinatorCommitDecision::parse(IDLParserErrorContext("AbortTest"), expectedObj); + ASSERT_BSONOBJ_EQ(obj, deserializedDecision.toBSON()); +} + } // namespace } // namespace txn } // namespace mongo diff --git a/src/mongo/db/s/transaction_coordinator_test.cpp b/src/mongo/db/s/transaction_coordinator_test.cpp index fc5857570cb..ddf0bb96fa6 100644 --- a/src/mongo/db/s/transaction_coordinator_test.cpp +++ b/src/mongo/db/s/transaction_coordinator_test.cpp @@ -54,7 +54,8 @@ using TransactionCoordinatorDocument = txn::TransactionCoordinatorDocument; const Hours kLongFutureTimeout(8); const StatusWith<BSONObj> kNoSuchTransaction = - BSON("ok" << 0 << "code" << ErrorCodes::NoSuchTransaction); + BSON("ok" << 0 << "code" << ErrorCodes::NoSuchTransaction << "errmsg" + << "No such transaction exists"); const StatusWith<BSONObj> kOk = BSON("ok" << 1); const Timestamp kDummyPrepareTimestamp = Timestamp(1, 1); @@ -63,6 +64,7 @@ StatusWith<BSONObj> makePrepareOkResponse(const Timestamp& timestamp) { } const StatusWith<BSONObj> kPrepareOk = makePrepareOkResponse(kDummyPrepareTimestamp); +const StatusWith<BSONObj> kPrepareOkNoTimestamp = BSON("ok" << 1); class TransactionCoordinatorTestBase : public TransactionCoordinatorTestFixture { protected: @@ -274,12 +276,15 @@ TEST_F(TransactionCoordinatorDriverTest, getServiceContext(), aws, kTwoShardIdList[0], makeDummyPrepareCommand(_lsid, _txnNumber)); assertPrepareSentAndRespondWithRetryableError(); - aws.shutdown({ErrorCodes::TransactionCoordinatorReachedAbortDecision, "Retry interrupted"}); + const auto shutdownStatus = + Status{ErrorCodes::TransactionCoordinatorReachedAbortDecision, "Retry interrupted"}; + aws.shutdown(shutdownStatus); advanceClockAndExecuteScheduledTasks(); auto response = future.get(); ASSERT(response.vote == boost::none); ASSERT(response.prepareTimestamp == boost::none); + ASSERT_EQ(shutdownStatus.code(), response.abortReason->code()); } TEST_F(TransactionCoordinatorDriverTest, @@ -307,6 +312,8 @@ TEST_F(TransactionCoordinatorDriverTest, auto response = future.get(); ASSERT(response.vote == txn::PrepareVote::kAbort); ASSERT(response.prepareTimestamp == boost::none); + ASSERT(response.abortReason); + ASSERT_EQ(ErrorCodes::NoSuchTransaction, response.abortReason->code()); } TEST_F(TransactionCoordinatorDriverTest, @@ -321,6 +328,8 @@ TEST_F(TransactionCoordinatorDriverTest, auto response = future.get(); ASSERT(response.vote == txn::PrepareVote::kAbort); ASSERT(response.prepareTimestamp == boost::none); + ASSERT(response.abortReason); + ASSERT_EQ(ErrorCodes::NoSuchTransaction, response.abortReason->code()); } TEST_F(TransactionCoordinatorDriverTest, @@ -332,7 +341,10 @@ TEST_F(TransactionCoordinatorDriverTest, [&](const executor::RemoteCommandRequest& request) { return kPrepareOk; }}); auto decision = future.get().decision(); + ASSERT(decision.getDecision() == txn::CommitDecision::kAbort); + ASSERT(decision.getAbortStatus()); + ASSERT_EQ(ErrorCodes::NoSuchTransaction, decision.getAbortStatus()->code()); } TEST_F(TransactionCoordinatorDriverTest, @@ -345,6 +357,8 @@ TEST_F(TransactionCoordinatorDriverTest, auto decision = future.get().decision(); ASSERT(decision.getDecision() == txn::CommitDecision::kAbort); + ASSERT(decision.getAbortStatus()); + ASSERT_EQ(ErrorCodes::NoSuchTransaction, decision.getAbortStatus()->code()); } TEST_F(TransactionCoordinatorDriverTest, @@ -353,10 +367,12 @@ TEST_F(TransactionCoordinatorDriverTest, auto future = txn::sendPrepare(getServiceContext(), aws, _lsid, _txnNumber, kTwoShardIdList); onCommands({[&](const executor::RemoteCommandRequest& request) { return kNoSuchTransaction; }, - [&](const executor::RemoteCommandRequest& request) { return kPrepareOk; }}); + [&](const executor::RemoteCommandRequest& request) { return kNoSuchTransaction; }}); auto decision = future.get().decision(); ASSERT(decision.getDecision() == txn::CommitDecision::kAbort); + ASSERT(decision.getAbortStatus()); + ASSERT_EQ(ErrorCodes::NoSuchTransaction, decision.getAbortStatus()->code()); } TEST_F(TransactionCoordinatorDriverTest, @@ -372,6 +388,7 @@ TEST_F(TransactionCoordinatorDriverTest, auto decision = future.get().decision(); ASSERT(decision.getDecision() == txn::CommitDecision::kCommit); + ASSERT(!decision.getAbortStatus()); ASSERT_EQ(maxPrepareTimestamp, *decision.getCommitTimestamp()); } @@ -388,6 +405,7 @@ TEST_F(TransactionCoordinatorDriverTest, auto decision = future.get().decision(); ASSERT(decision.getDecision() == txn::CommitDecision::kCommit); + ASSERT(!decision.getAbortStatus()); ASSERT_EQ(maxPrepareTimestamp, *decision.getCommitTimestamp()); } @@ -404,9 +422,27 @@ TEST_F(TransactionCoordinatorDriverTest, auto decision = future.get().decision(); ASSERT(decision.getDecision() == txn::CommitDecision::kCommit); + ASSERT(!decision.getAbortStatus()); ASSERT_EQ(maxPrepareTimestamp, *decision.getCommitTimestamp()); } +TEST_F(TransactionCoordinatorDriverTest, + SendPrepareReturnsAbortDecisionWhenNoPreparedTimestampIsReturned) { + const auto timestamp = Timestamp(1, 1); + + txn::AsyncWorkScheduler aws(getServiceContext()); + auto future = txn::sendPrepare(getServiceContext(), aws, _lsid, _txnNumber, kTwoShardIdList); + + assertPrepareSentAndRespondWithSuccess(timestamp); + assertCommandSentAndRespondWith( + PrepareTransaction::kCommandName, kPrepareOkNoTimestamp, WriteConcernOptions::Majority); + + auto decision = future.get().decision(); + + ASSERT(decision.getDecision() == txn::CommitDecision::kAbort); + ASSERT(decision.getAbortStatus()); + ASSERT_EQ(ErrorCodes::InternalError, decision.getAbortStatus()->code()); +} class TransactionCoordinatorDriverPersistenceTest : public TransactionCoordinatorDriverTest { protected: @@ -465,7 +501,17 @@ protected: TxnNumber txnNumber, const std::vector<ShardId>& participants, const boost::optional<Timestamp>& commitTimestamp) { - txn::persistDecision(*_aws, lsid, txnNumber, participants, commitTimestamp).get(); + txn::persistDecision(*_aws, lsid, txnNumber, participants, [&] { + txn::CoordinatorCommitDecision decision; + if (commitTimestamp) { + decision.setDecision(txn::CommitDecision::kCommit); + decision.setCommitTimestamp(commitTimestamp); + } else { + decision.setDecision(txn::CommitDecision::kAbort); + decision.setAbortStatus(Status(ErrorCodes::NoSuchTransaction, "Test abort status")); + } + return decision; + }()).get(); auto allCoordinatorDocs = txn::readAllCoordinatorDocs(opCtx); ASSERT_EQUALS(allCoordinatorDocs.size(), size_t(1)); @@ -573,8 +619,11 @@ TEST_F(TransactionCoordinatorDriverPersistenceTest, TEST_F(TransactionCoordinatorDriverPersistenceTest, PersistCommitDecisionWhenNoDocumentForTransactionExistsCanBeInterruptedAndReturnsError) { - Future<void> future = txn::persistDecision( - *_aws, _lsid, _txnNumber, _participants, _commitTimestamp /* commit */); + Future<void> future = txn::persistDecision(*_aws, _lsid, _txnNumber, _participants, [&] { + txn::CoordinatorCommitDecision decision(txn::CommitDecision::kCommit); + decision.setCommitTimestamp(_commitTimestamp); + return decision; + }()); _aws->shutdown({ErrorCodes::TransactionCoordinatorSteppingDown, "Shutdown for test"}); ASSERT_THROWS_CODE( @@ -639,7 +688,11 @@ TEST_F(TransactionCoordinatorDriverPersistenceTest, // Delete the document for the first transaction and check that only the second transaction's // document still exists. - txn::persistDecision(*_aws, _lsid, txnNumber1, _participants, boost::none /* abort */).get(); + txn::persistDecision(*_aws, _lsid, txnNumber1, _participants, [&] { + txn::CoordinatorCommitDecision decision(txn::CommitDecision::kAbort); + decision.setAbortStatus(Status(ErrorCodes::NoSuchTransaction, "Test abort error")); + return decision; + }()).get(); txn::deleteCoordinatorDoc(*_aws, _lsid, txnNumber1).get(); allCoordinatorDocs = txn::readAllCoordinatorDocs(operationContext()); @@ -2027,6 +2080,8 @@ TEST_F(TransactionCoordinatorMetricsTest, SlowLogLineIncludesTerminationCauseFor stopCapturingLogMessages(); ASSERT_EQUALS(1, countLogLinesContaining("terminationCause:aborted")); + ASSERT_EQUALS(1, + countLogLinesContaining("terminationDetails: NoSuchTransaction: from shard s2")); } TEST_F(TransactionCoordinatorMetricsTest, SlowLogLineIncludesNumParticipants) { diff --git a/src/mongo/db/s/transaction_coordinator_util.cpp b/src/mongo/db/s/transaction_coordinator_util.cpp index 05b9ff6437b..5baab08bc83 100644 --- a/src/mongo/db/s/transaction_coordinator_util.cpp +++ b/src/mongo/db/s/transaction_coordinator_util.cpp @@ -212,10 +212,11 @@ void PrepareVoteConsensus::registerVote(const PrepareResponse& vote) { if (vote.vote == PrepareVote::kCommit) { ++_numCommitVotes; _maxPrepareTimestamp = std::max(_maxPrepareTimestamp, *vote.prepareTimestamp); - } else if (vote.vote == PrepareVote::kAbort) { - ++_numAbortVotes; } else { - ++_numNoVotes; + vote.vote == PrepareVote::kAbort ? ++_numAbortVotes : ++_numNoVotes; + + if (!_abortStatus) + _abortStatus.emplace(*vote.abortReason); } } @@ -227,7 +228,9 @@ CoordinatorCommitDecision PrepareVoteConsensus::decision() const { decision.setDecision(CommitDecision::kCommit); decision.setCommitTimestamp(_maxPrepareTimestamp); } else { + invariant(_abortStatus); decision.setDecision(CommitDecision::kAbort); + decision.setAbortStatus(*_abortStatus); } return decision; } @@ -287,8 +290,9 @@ void persistDecisionBlocking(OperationContext* opCtx, const LogicalSessionId& lsid, TxnNumber txnNumber, const std::vector<ShardId>& participantList, - const boost::optional<Timestamp>& commitTimestamp) { - LOG(3) << "Going to write decision " << (commitTimestamp ? "commit" : "abort") << " for " + const txn::CoordinatorCommitDecision& decision) { + const bool isCommit = decision.getDecision() == txn::CommitDecision::kCommit; + LOG(3) << "Going to write decision " << (isCommit ? "commit" : "abort") << " for " << lsid.getId() << ':' << txnNumber; if (MONGO_FAIL_POINT(hangBeforeWritingDecision)) { @@ -313,18 +317,9 @@ void persistDecisionBlocking(OperationContext* opCtx, // if an earlier attempt to write the decision failed waiting for writeConcern. BSONObj noDecision = BSON(TransactionCoordinatorDocument::kDecisionFieldName << BSON("$exists" << false)); - BSONObj sameDecision; - if (commitTimestamp) { - sameDecision = BSON(TransactionCoordinatorDocument::kDecisionFieldName - << BSON(TransactionCoordinatorDocument::kDecisionFieldName - << "commit" - << "commitTimestamp" - << *commitTimestamp)); - } else { - sameDecision = - BSON(TransactionCoordinatorDocument::kDecisionFieldName - << BSON(TransactionCoordinatorDocument::kDecisionFieldName << "abort")); - } + BSONObj sameDecision = + BSON(TransactionCoordinatorDocument::kDecisionFieldName << decision.toBSON()); + entry.setQ(BSON(TransactionCoordinatorDocument::kIdFieldName << sessionInfo.toBSON() << "$and" @@ -332,19 +327,13 @@ void persistDecisionBlocking(OperationContext* opCtx, << "$or" << BSON_ARRAY(noDecision << sameDecision))); - // Update with decision. - TransactionCoordinatorDocument doc; - doc.setId(sessionInfo); - doc.setParticipants(std::move(participantList)); - txn::CoordinatorCommitDecision decision; - if (commitTimestamp) { - decision.setDecision(CommitDecision::kCommit); - decision.setCommitTimestamp(commitTimestamp); - } else { - decision.setDecision(CommitDecision::kAbort); - } - doc.setDecision(decision); - entry.setU(doc.toBSON()); + entry.setU([&] { + TransactionCoordinatorDocument doc; + doc.setId(sessionInfo); + doc.setParticipants(std::move(participantList)); + doc.setDecision(decision); + return doc.toBSON(); + }()); return entry; }()}); @@ -361,11 +350,11 @@ void persistDecisionBlocking(OperationContext* opCtx, // exists. Note that this is best-effort: the document may have been deleted or manually // changed since the update above ran. const auto doc = client.findOne( - NamespaceString::kTransactionCoordinatorsNamespace.toString(), + NamespaceString::kTransactionCoordinatorsNamespace.ns(), QUERY(TransactionCoordinatorDocument::kIdFieldName << sessionInfo.toBSON())); uasserted(51026, str::stream() << "While attempting to write decision " - << (commitTimestamp ? "'commit'" : "'abort'") + << (isCommit ? "'commit'" : "'abort'") << " for" << lsid.getId() << ':' @@ -376,8 +365,8 @@ void persistDecisionBlocking(OperationContext* opCtx, << doc); } - LOG(3) << "Wrote decision " << (commitTimestamp ? "commit" : "abort") << " for " << lsid.getId() - << ':' << txnNumber; + LOG(3) << "Wrote decision " << (isCommit ? "commit" : "abort") << " for " << lsid.getId() << ':' + << txnNumber; MONGO_FAIL_POINT_BLOCK(hangBeforeWaitingForDecisionWriteConcern, fp) { LOG(0) << "Hit hangBeforeWaitingForDecisionWriteConcern failpoint"; @@ -403,17 +392,17 @@ Future<void> persistDecision(txn::AsyncWorkScheduler& scheduler, const LogicalSessionId& lsid, TxnNumber txnNumber, const txn::ParticipantsList& participants, - const boost::optional<Timestamp>& commitTimestamp) { - return txn::doWhile( - scheduler, - boost::none /* no need for a backoff */, - [](const Status& s) { return shouldRetryPersistingCoordinatorState(s); }, - [&scheduler, lsid, txnNumber, participants, commitTimestamp] { - return scheduler.scheduleWork( - [lsid, txnNumber, participants, commitTimestamp](OperationContext* opCtx) { - persistDecisionBlocking(opCtx, lsid, txnNumber, participants, commitTimestamp); - }); - }); + const txn::CoordinatorCommitDecision& decision) { + return txn::doWhile(scheduler, + boost::none /* no need for a backoff */, + [](const Status& s) { return shouldRetryPersistingCoordinatorState(s); }, + [&scheduler, lsid, txnNumber, participants, decision] { + return scheduler.scheduleWork( + [lsid, txnNumber, participants, decision](OperationContext* opCtx) { + persistDecisionBlocking( + opCtx, lsid, txnNumber, participants, decision); + }); + }); } Future<void> sendCommit(ServiceContext* service, @@ -596,33 +585,45 @@ Future<PrepareResponse> sendPrepareToShard(ServiceContext* service, auto prepareTimestampField = response.data["prepareTimestamp"]; if (prepareTimestampField.eoo() || prepareTimestampField.timestamp().isNull()) { - LOG(0) << "Coordinator shard received an OK response to " - "prepareTransaction " - "without a prepareTimestamp from shard " - << shardId << ", which is not expected behavior. " - "Interpreting the response from " - << shardId << " as a vote to abort"; - return PrepareResponse{shardId, PrepareVote::kAbort, boost::none}; + Status abortStatus(ErrorCodes::InternalError, + str::stream() + << "Coordinator shard received an OK response " + "to prepareTransaction without a " + "prepareTimestamp from shard " + << shardId + << ", which is not an expected behavior. " + "Interpreting the response as vote to abort"); + LOG(0) << redact(abortStatus); + + return PrepareResponse{ + shardId, PrepareVote::kAbort, boost::none, abortStatus}; } LOG(3) << "Coordinator shard received a vote to commit from shard " << shardId << " with prepareTimestamp: " << prepareTimestampField.timestamp(); - return PrepareResponse{ - shardId, PrepareVote::kCommit, prepareTimestampField.timestamp()}; + + return PrepareResponse{shardId, + PrepareVote::kCommit, + prepareTimestampField.timestamp(), + boost::none}; } LOG(3) << "Coordinator shard received " << status << " from shard " << shardId << " for " << commandObj; if (ErrorCodes::isVoteAbortError(status.code())) { - return PrepareResponse{shardId, PrepareVote::kAbort, boost::none}; + return PrepareResponse{ + shardId, + PrepareVote::kAbort, + boost::none, + status.withContext(str::stream() << "from shard " << shardId)}; } uassertStatusOK(status); MONGO_UNREACHABLE; }) - .onError<ErrorCodes::ShardNotFound>([shardId, isLocalShard](const Status&) { + .onError<ErrorCodes::ShardNotFound>([shardId, isLocalShard](const Status& status) { invariant(!isLocalShard); // ShardNotFound may indicate that the participant shard has been removed (it // could also mean the participant shard was recently added and this node @@ -632,14 +633,14 @@ Future<PrepareResponse> sendPrepareToShard(ServiceContext* service, // treat ShardNotFound as a vote to abort, which is always safe since the node // must then send abort. return Future<PrepareResponse>::makeReady( - {shardId, CommitDecision::kAbort, boost::none}); + {shardId, CommitDecision::kAbort, boost::none, status}); }); }); return std::move(f).onError<ErrorCodes::TransactionCoordinatorReachedAbortDecision>( - [shardId](const Status&) { + [shardId](const Status& status) { LOG(3) << "Prepare stopped retrying due to retrying being cancelled"; - return PrepareResponse{shardId, boost::none, boost::none}; + return PrepareResponse{shardId, boost::none, boost::none, status}; }); } diff --git a/src/mongo/db/s/transaction_coordinator_util.h b/src/mongo/db/s/transaction_coordinator_util.h index 8721c0bd2d3..d6e87bb4baf 100644 --- a/src/mongo/db/s/transaction_coordinator_util.h +++ b/src/mongo/db/s/transaction_coordinator_util.h @@ -79,6 +79,8 @@ private: int _numNoVotes{0}; Timestamp _maxPrepareTimestamp; + + boost::optional<Status> _abortStatus; }; /** @@ -130,7 +132,7 @@ Future<void> persistDecision(txn::AsyncWorkScheduler& scheduler, const LogicalSessionId& lsid, TxnNumber txnNumber, const txn::ParticipantsList& participants, - const boost::optional<Timestamp>& commitTimestamp); + const txn::CoordinatorCommitDecision& decision); /** * Sends commit to all shards and returns a future that will be resolved when all participants have @@ -195,6 +197,9 @@ struct PrepareResponse { // Will only be set if the vote was kCommit boost::optional<Timestamp> prepareTimestamp; + + // Will only be set if the vote was kAbort or no value + boost::optional<Status> abortReason; }; Future<PrepareResponse> sendPrepareToShard(ServiceContext* service, txn::AsyncWorkScheduler& scheduler, |