author     Jack Mulrow <jack.mulrow@mongodb.com>    2018-12-05 11:22:28 -0500
committer  Jack Mulrow <jack.mulrow@mongodb.com>    2018-12-16 14:04:10 -0500
commit     2d94c21d3af7d4b4fc32e6c34791ac24a2a35c0a (patch)
tree       63d4ae1994fcf5849e0974fa7af897f2bf44bf12 /src/mongo
parent     54c2a4560cbcd5b652c3bdc8543404ea6cc2d485 (diff)
download   mongo-2d94c21d3af7d4b4fc32e6c34791ac24a2a35c0a.tar.gz
SERVER-38470 Add more logging to TransactionRouter
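The change tags the router's transaction-related log lines with a transaction id prefix (transaction number plus logical session id, built by a new _txnIdToString() helper) and logs the major state transitions: a new transaction starting, the global snapshot timestamp being set or reset, pending participants being cleared after stale version, snapshot, or view resolution errors, and the single-shard commit, two-phase commit, and explicit/implicit abort paths. As a rough illustration only (the txnNumber and lsid values below are placeholders, not from a real run), a line produced with the new prefix would look something like:

```
(txnNumber: 3, lsid: <session UUID>) New transaction started
```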
Diffstat (limited to 'src/mongo')
-rw-r--r--  src/mongo/s/commands/cluster_distinct_cmd.cpp    2
-rw-r--r--  src/mongo/s/query/cluster_aggregate.cpp          2
-rw-r--r--  src/mongo/s/transaction_router.cpp               49
-rw-r--r--  src/mongo/s/transaction_router.h                 11
-rw-r--r--  src/mongo/s/transaction_router_test.cpp          8
5 files changed, 58 insertions(+), 14 deletions(-)
diff --git a/src/mongo/s/commands/cluster_distinct_cmd.cpp b/src/mongo/s/commands/cluster_distinct_cmd.cpp
index bcc8b9976f0..3e4fe391d73 100644
--- a/src/mongo/s/commands/cluster_distinct_cmd.cpp
+++ b/src/mongo/s/commands/cluster_distinct_cmd.cpp
@@ -201,7 +201,7 @@ public:
         auto resolvedAggCmd = resolvedAggRequest.serializeToCommandObj().toBson();
 
         if (auto txnRouter = TransactionRouter::get(opCtx)) {
-            txnRouter->onViewResolutionError();
+            txnRouter->onViewResolutionError(nss);
         }
 
         BSONObj aggResult = CommandHelpers::runCommandDirectly(
diff --git a/src/mongo/s/query/cluster_aggregate.cpp b/src/mongo/s/query/cluster_aggregate.cpp
index 506842da29c..82d5e5a7966 100644
--- a/src/mongo/s/query/cluster_aggregate.cpp
+++ b/src/mongo/s/query/cluster_aggregate.cpp
@@ -884,7 +884,7 @@ Status ClusterAggregate::retryOnViewError(OperationContext* opCtx,
     result->resetToEmpty();
 
     if (auto txnRouter = TransactionRouter::get(opCtx)) {
-        txnRouter->onViewResolutionError();
+        txnRouter->onViewResolutionError(requestedNss);
     }
 
     // We pass both the underlying collection namespace and the view namespace here. The
diff --git a/src/mongo/s/transaction_router.cpp b/src/mongo/s/transaction_router.cpp
index 2e6e95ce68a..8a6f4623017 100644
--- a/src/mongo/s/transaction_router.cpp
+++ b/src/mongo/s/transaction_router.cpp
@@ -279,10 +279,13 @@ const boost::optional<ShardId>& TransactionRouter::getCoordinatorId() const {
 BSONObj TransactionRouter::attachTxnFieldsIfNeeded(const ShardId& shardId, const BSONObj& cmdObj) {
     if (auto txnPart = getParticipant(shardId)) {
+        LOG(4) << _txnIdToString()
+               << " Sending transaction fields to existing participant: " << shardId;
         return txnPart->attachTxnFieldsIfNeeded(cmdObj, false);
     }
 
     auto txnPart = _createParticipant(shardId);
+    LOG(4) << _txnIdToString() << " Sending transaction fields to new participant: " << shardId;
     return txnPart.attachTxnFieldsIfNeeded(cmdObj, true);
 }
 
@@ -378,14 +381,20 @@ void TransactionRouter::onStaleShardOrDbError(StringData cmdName, const Status&
                           << errorStatus,
             _canContinueOnStaleShardOrDbError(cmdName));
 
+    LOG(0) << _txnIdToString()
+           << " Clearing pending participants after stale version error: " << errorStatus;
+
     // Remove participants created during the current statement so they are sent the correct options
     // if they are targeted again by the retry.
     _clearPendingParticipants();
 }
 
-void TransactionRouter::onViewResolutionError() {
+void TransactionRouter::onViewResolutionError(const NamespaceString& nss) {
     // The router can always retry on a view resolution error.
 
+    LOG(0) << _txnIdToString()
+           << " Clearing pending participants after view resolution error on namespace: " << nss;
+
     // Requests against views are always routed to the primary shard for its database, but the retry
     // on the resolved namespace does not have to re-target the primary, so pending participants
     // should be cleared.
@@ -404,6 +413,10 @@ void TransactionRouter::onSnapshotError(const Status& errorStatus) {
                           << errorStatus,
             _canContinueOnSnapshotError());
 
+    LOG(0) << _txnIdToString() << " Clearing pending participants and resetting global snapshot "
+              "timestamp after snapshot error: "
+           << errorStatus << ", previous timestamp: " << _atClusterTime->getTime();
+
     // The transaction must be restarted on all participants because a new read timestamp will be
     // selected, so clear all pending participants. Snapshot errors are only retryable on the first
     // client statement, so all participants should be cleared, including the coordinator.
@@ -412,7 +425,6 @@ void TransactionRouter::onSnapshotError(const Status& errorStatus) {
     invariant(!_coordinatorId);
 
     // Reset the global snapshot timestamp so the retry will select a new one.
-    invariant(_atClusterTime);
     _atClusterTime.reset();
     _atClusterTime.emplace();
 }
@@ -462,6 +474,9 @@ void TransactionRouter::_setAtClusterTime(const boost::optional<LogicalTime>& af
         return;
     }
 
+    LOG(0) << _txnIdToString() << " Setting global snapshot timestamp to " << candidateTime
+           << " on statement " << _latestStmtId;
+
     _atClusterTime->setTime(candidateTime, _latestStmtId);
 }
 
@@ -529,6 +544,8 @@ void TransactionRouter::beginOrContinueTxn(OperationContext* opCtx,
     if (_readConcernArgs.getLevel() == repl::ReadConcernLevel::kSnapshotReadConcern) {
         _atClusterTime.emplace();
     }
+
+    LOG(0) << _txnIdToString() << " New transaction started";
 }
 
 const LogicalSessionId& TransactionRouter::_sessionId() const {
@@ -546,6 +563,9 @@ Shard::CommandResponse TransactionRouter::_commitSingleShardTransaction(Operatio
     CommitTransaction commitCmd;
     commitCmd.setDbName("admin");
 
+    LOG(0) << _txnIdToString()
+           << " Committing single shard transaction, single participant: " << shardId;
+
     const auto& participant = citer->second;
     return uassertStatusOK(shard->runCommandWithFixedRetryAttempts(
         opCtx,
@@ -611,6 +631,9 @@ Shard::CommandResponse TransactionRouter::_commitMultiShardTransaction(Operation
     _initiatedTwoPhaseCommit = true;
 
+    LOG(0) << _txnIdToString()
+           << " Committing multi shard transaction, coordinator: " << *_coordinatorId;
+
     return uassertStatusOK(coordinatorShard->runCommandWithFixedRetryAttempts(
         opCtx,
         ReadPreferenceSetting{ReadPreference::PrimaryOnly},
@@ -633,7 +656,7 @@ Shard::CommandResponse TransactionRouter::commitTransaction(OperationContext* op
 }
 
 std::vector<AsyncRequestsSender::Response> TransactionRouter::abortTransaction(
-    OperationContext* opCtx) {
+    OperationContext* opCtx, bool isImplicit) {
     // The router has yet to send any commands to a remote shard for this transaction.
     // Return the same error that would have been returned by a shard.
     uassert(ErrorCodes::NoSuchTransaction,
@@ -647,6 +670,14 @@ std::vector<AsyncRequestsSender::Response> TransactionRouter::abortTransaction(
         abortRequests.emplace_back(ShardId(participantEntry.first), abortCmd);
     }
 
+    if (isImplicit) {
+        LOG(0) << _txnIdToString() << " Implicitly aborting transaction on " << _participants.size()
+               << " shard(s)";
+    } else {
+        LOG(0) << _txnIdToString() << " Aborting transaction on " << _participants.size()
+               << " shard(s)";
+    }
+
     return gatherResponses(opCtx,
                            NamespaceString::kAdminDb,
                            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
@@ -660,17 +691,21 @@ void TransactionRouter::implicitlyAbortTransaction(OperationContext* opCtx) {
     }
 
     if (_initiatedTwoPhaseCommit) {
-        LOG(0) << "Router not sending implicit abortTransaction for transaction "
-               << *opCtx->getTxnNumber() << " on session " << opCtx->getLogicalSessionId()->toBSON()
-               << " because already initiated two phase commit for the transaction";
+        LOG(0) << _txnIdToString() << " Router not sending implicit abortTransaction because "
+                  "already initiated two phase commit for the transaction";
         return;
     }
 
     try {
-        abortTransaction(opCtx);
+        abortTransaction(opCtx, true /*isImplicit*/);
     } catch (...) {
         // Ignore any exceptions.
     }
 }
 
+std::string TransactionRouter::_txnIdToString() {
+    return str::stream() << "(txnNumber: " << _txnNumber << ", lsid: " << _sessionId().getId()
+                         << ")";
+}
+
 }  // namespace mongo
diff --git a/src/mongo/s/transaction_router.h b/src/mongo/s/transaction_router.h
index 2cdf38bf389..9791ed2e00e 100644
--- a/src/mongo/s/transaction_router.h
+++ b/src/mongo/s/transaction_router.h
@@ -185,7 +185,7 @@ public:
      * Updates the transaction tracking state to allow for a retry attempt on a view resolution
      * error.
      */
-    void onViewResolutionError();
+    void onViewResolutionError(const NamespaceString& nss);
 
     /**
      * Computes and sets the atClusterTime for the current transaction based on the given query
@@ -233,7 +233,8 @@ public:
     /**
      * Sends abort to all participants and returns the responses from all shards.
     */
-    std::vector<AsyncRequestsSender::Response> abortTransaction(OperationContext* opCtx);
+    std::vector<AsyncRequestsSender::Response> abortTransaction(OperationContext* opCtx,
+                                                                bool isImplicit = false);
 
     /**
      * Sends abort to all shards in the current participant list. Will retry on retryable errors,
@@ -317,6 +318,12 @@ private:
      */
     void _verifyParticipantAtClusterTime(const Participant& participant);
 
+    /**
+     * Returns a string with the active transaction's transaction number and logical session id
+     * (i.e. the transaction id).
+     */
+    std::string _txnIdToString();
+
     // The currently active transaction number on this transaction router (i.e. on the session)
     TxnNumber _txnNumber{kUninitializedTxnNumber};
 
diff --git a/src/mongo/s/transaction_router_test.cpp b/src/mongo/s/transaction_router_test.cpp
index 05c0bcdcb42..95c0e0bd7f1 100644
--- a/src/mongo/s/transaction_router_test.cpp
+++ b/src/mongo/s/transaction_router_test.cpp
@@ -71,6 +71,8 @@ protected:
 
     const Status kDummyStatus = {ErrorCodes::InternalError, "dummy"};
 
+    const NamespaceString kViewNss = NamespaceString("test.foo");
+
     void setUp() override {
         ShardingTestFixture::setUp();
         configTargeter()->setFindHostReturnValue(kTestConfigShardHost);
@@ -1031,7 +1033,7 @@ TEST_F(TransactionRouterTestWithDefaultSession,
                                                  << "test"));
     ASSERT_BSONOBJ_EQ(expectedReadConcern, newCmd["readConcern"].Obj());
 
-    txnRouter.onViewResolutionError();
+    txnRouter.onViewResolutionError(kViewNss);
     txnRouter.setDefaultAtClusterTime(operationContext());
 
     newCmd = txnRouter.attachTxnFieldsIfNeeded(shard1,
@@ -1184,7 +1186,7 @@ TEST_F(TransactionRouterTestWithDefaultSession, OnViewResolutionErrorClearsAllNe
 
     // Simulate a view resolution error on the first client statement that leads to a retry which
     // targets the same shard.
-    txnRouter.onViewResolutionError();
+    txnRouter.onViewResolutionError(kViewNss);
 
     // The only participant was the coordinator, so it should have been reset.
     ASSERT_FALSE(txnRouter.getCoordinatorId());
@@ -1203,7 +1205,7 @@ TEST_F(TransactionRouterTestWithDefaultSession, OnViewResolutionErrorClearsAllNe
 
     // Simulate a view resolution error.
-    txnRouter.onViewResolutionError();
+    txnRouter.onViewResolutionError(kViewNss);
 
     // Only the new participant shard was reset.
     firstShardCmd = txnRouter.attachTxnFieldsIfNeeded(shard1, {});
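For reference, below is a minimal, self-contained C++ sketch of the logging pattern this commit introduces. It is not MongoDB code: std::ostringstream stands in for mongo's str::stream, and a plain integer and string stand in for TxnNumber and the logical session id. It only illustrates how a _txnIdToString()-style prefix combines with the new isImplicit flag to produce the abort messages added above.

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Stand-in for mongo's TxnNumber (assumption: simplified to a plain integer type).
using TxnNumber = long long;

struct RouterLogDemo {
    TxnNumber txnNumber;
    std::string sessionUuid;   // stand-in for _sessionId().getId()
    int participantCount;      // stand-in for _participants.size()

    // Mirrors the idea of TransactionRouter::_txnIdToString(): a short
    // "(txnNumber: ..., lsid: ...)" prefix shared by every log line.
    std::string txnIdToString() const {
        std::ostringstream os;
        os << "(txnNumber: " << txnNumber << ", lsid: " << sessionUuid << ")";
        return os.str();
    }

    // Mirrors the explicit/implicit branching added to abortTransaction().
    void logAbort(bool isImplicit) const {
        if (isImplicit) {
            std::cout << txnIdToString() << " Implicitly aborting transaction on "
                      << participantCount << " shard(s)\n";
        } else {
            std::cout << txnIdToString() << " Aborting transaction on " << participantCount
                      << " shard(s)\n";
        }
    }
};

int main() {
    // Made-up values, for illustration only.
    RouterLogDemo demo{3, "4a9e295f-0000-4000-8000-000000000000", 2};
    demo.logAbort(false /*isImplicit*/);
    demo.logAbort(true /*isImplicit*/);
    return 0;
}
```

In the actual change, the session id comes from _sessionId().getId() and the shard count from _participants.size(), as shown in the transaction_router.cpp hunks above.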