diff options
author | Rachita Dhawan <rachita.dhawan@gmail.com> | 2022-05-04 19:10:44 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-05-17 15:48:13 +0000 |
commit | ce47119ff1a1ab79f18235f1f42eeef39da6f20c (patch) | |
tree | dd34fbdfb6a65fe2378fa7166654c0893ad8db49 | |
parent | 387db2d0f7b26b4ae6efdf46854e60f12f93d4d2 (diff) | |
download | mongo-ce47119ff1a1ab79f18235f1f42eeef39da6f20c.tar.gz |
SERVER-62175 Mongos fails to attach RetryableWrite Error Label For Command Interrupted In _parseCommandr6.0.0-rc6
(cherry picked from commit 38e4a5516d614a2bd1f3c3afde97c19068fd1441)
-rw-r--r-- | jstests/sharding/mongos_insert_fails_with_shutdown.js | 59 | ||||
-rw-r--r-- | src/mongo/s/commands/strategy.cpp | 110 | ||||
-rw-r--r-- | src/mongo/s/mongos_main.cpp | 6 |
3 files changed, 130 insertions, 45 deletions
diff --git a/jstests/sharding/mongos_insert_fails_with_shutdown.js b/jstests/sharding/mongos_insert_fails_with_shutdown.js new file mode 100644 index 00000000000..df02f6f54e6 --- /dev/null +++ b/jstests/sharding/mongos_insert_fails_with_shutdown.js @@ -0,0 +1,59 @@ +/** +@tags: [multiversion_incompatible] +*/ + +// Don't check for UUID index consistency,orphans and routine table across the cluster at the end, +// since the test shuts down a mongos +TestData.skipCheckingUUIDsConsistentAcrossCluster = true; +TestData.skipCheckingIndexesConsistentAcrossCluster = true; +TestData.skipCheckOrphans = true; +TestData.skipCheckRoutingTableConsistency = true; +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load('jstests/libs/parallelTester.js'); +load("jstests/libs/retryable_writes_util.js"); +const st = new ShardingTest({ + mongos: 1, + config: 1, + shards: 2, +}); +const dbName = "test"; +const collName = "mycoll"; +const fpData = { + "cmdName": "insert", + "ns": dbName + '.' + collName +}; +const hangBeforeCheckInterruptFailPoint = + configureFailPoint(st.s, "hangBeforeCheckingMongosShutdownInterrupt", fpData); + +const insertThread = new Thread(function insertDoc(host, dbName, collName) { + var lsid = UUID(); + const conn = new Mongo(host); + const retrySession = conn.startSession({retryWrites: true}); + const retrySessionDB = retrySession.getDatabase(dbName); + + try { + var res = assert.commandWorked(retrySessionDB.runCommand({ + insert: 'mycoll', + documents: [{x: 0}, {x: 1}], + ordered: true, + lsid: {id: lsid}, + txnNumber: NumberLong(1) + })); + + } catch (e) { + assert.eq(e.errorLabels, ["RetryableWriteError"], e); + } + retrySession.endSession(); +}, st.s.host, dbName, collName); + +insertThread.start(); +hangBeforeCheckInterruptFailPoint.wait(); + +st.stopMongos(0); +insertThread.join(); + +st.stop(); +})(); diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp index c3ade949332..777d24bf257 100644 --- a/src/mongo/s/commands/strategy.cpp +++ b/src/mongo/s/commands/strategy.cpp @@ -97,10 +97,9 @@ #include "mongo/util/scopeguard.h" #include "mongo/util/str.h" #include "mongo/util/timer.h" - namespace mongo { namespace { - +MONGO_FAIL_POINT_DEFINE(hangBeforeCheckingMongosShutdownInterrupt); const auto kOperationTime = "operationTime"_sd; /** @@ -402,6 +401,9 @@ private: // invocation and extracting read/write concerns). void _parseCommand(); + // updates statistics and applies labels if an error occurs. + void _updateStatsAndApplyErrorLabels(const Status& status); + const std::shared_ptr<RequestExecutionContext> _rec; const std::shared_ptr<BSONObjBuilder> _errorBuilder; const NetworkOp _opType; @@ -437,9 +439,6 @@ public: private: Status _setup(); - // Logs and updates statistics if an error occurs. - void _tapOnError(const Status& status); - ParseAndRunCommand* const _parc; boost::optional<RouterOperationContextSession> _routerSession; @@ -482,7 +481,33 @@ private: int _tries = 0; }; +void ParseAndRunCommand::_updateStatsAndApplyErrorLabels(const Status& status) { + auto opCtx = _rec->getOpCtx(); + const auto command = _rec->getCommand(); + + if (command) + command->incrementCommandsFailed(); + NotPrimaryErrorTracker::get(opCtx->getClient()).recordError(status.code()); + // WriteConcern error (wcCode) is set to boost::none because: + // 1. TransientTransaction error label handling for commitTransaction command in mongos is + // delegated to the shards. Mongos simply propagates the shard's response up to the client. + // 2. For other commands in a transaction, they shouldn't get a writeConcern error so this + // setting doesn't apply. + if (_osi.has_value()) { + + auto errorLabels = getErrorLabels(opCtx, + *_osi, + command->getName(), + status.code(), + boost::none, + false /* isInternalClient */, + true /* isMongos */); + + + _errorBuilder->appendElements(errorLabels); + } +} void ParseAndRunCommand::_parseCommand() { auto opCtx = _rec->getOpCtx(); const auto& m = _rec->getMessage(); @@ -526,12 +551,6 @@ void ParseAndRunCommand::_parseCommand() { uassert(ErrorCodes::InvalidOptions, "no such command option $maxTimeMs; use maxTimeMS instead", request.body[query_request_helper::queryOptionMaxTimeMS].eoo()); - const int maxTimeMS = - uassertStatusOK(parseMaxTimeMS(request.body[query_request_helper::cmdOptionMaxTimeMS])); - if (maxTimeMS > 0 && command->getLogicalOp() != LogicalOp::opGetMore) { - opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMS}, ErrorCodes::MaxTimeMSExpired); - } - opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // If the command includes a 'comment' field, set it on the current OpCtx. if (auto commentField = request.body["comment"]) { @@ -597,6 +616,27 @@ Status ParseAndRunCommand::RunInvocation::_setup() { const auto& request = _parc->_rec->getRequest(); auto replyBuilder = _parc->_rec->getReplyBuilder(); + const int maxTimeMS = + uassertStatusOK(parseMaxTimeMS(request.body[query_request_helper::cmdOptionMaxTimeMS])); + if (maxTimeMS > 0 && command->getLogicalOp() != LogicalOp::opGetMore) { + opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMS}, ErrorCodes::MaxTimeMSExpired); + } + + if (MONGO_unlikely( + hangBeforeCheckingMongosShutdownInterrupt.shouldFail([&](const BSONObj& data) { + if (data.hasField("cmdName") && data.hasField("ns")) { + std::string cmdNS = _parc->_ns.get(); + return ((data.getStringField("cmdName") == _parc->_commandName) && + (data.getStringField("ns") == cmdNS)); + } + return false; + }))) { + + LOGV2(6217501, "Hanging before hangBeforeCheckingMongosShutdownInterrupt is cancelled"); + hangBeforeCheckingMongosShutdownInterrupt.pauseWhileSet(); + } + opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. + auto appendStatusToReplyAndSkipCommandExecution = [replyBuilder](Status status) -> Status { auto responseBuilder = replyBuilder->getBodyBuilder(); CommandHelpers::appendCommandStatusNoThrow(responseBuilder, status); @@ -871,8 +911,8 @@ void ParseAndRunCommand::RunAndRetry::_setup() { auto& readConcernArgs = repl::ReadConcernArgs::get(opCtx); if (_tries > 1) { - // Re-parse before retrying in case the process of run()-ning the invocation could affect - // the parsed result. + // Re-parse before retrying in case the process of run()-ning the invocation could + // affect the parsed result. _parc->_invocation = command->parse(opCtx, request); invariant(_parc->_invocation->ns().toString() == _parc->_ns, "unexpected change of namespace when retrying"); @@ -911,15 +951,15 @@ Future<void> ParseAndRunCommand::RunAndRetry::_run() { } void ParseAndRunCommand::RunAndRetry::_checkRetryForTransaction(Status& status) { - // Retry logic specific to transactions. Throws and aborts the transaction if the error cannot - // be retried on. + // Retry logic specific to transactions. Throws and aborts the transaction if the error + // cannot be retried on. auto opCtx = _parc->_rec->getOpCtx(); auto txnRouter = TransactionRouter::get(opCtx); if (!txnRouter) { if (opCtx->inMultiDocumentTransaction()) { - // This command must have failed while its session was yielded. We cannot retry in this - // case, whatever the session was yielded to is responsible for that, so rethrow the - // error. + // This command must have failed while its session was yielded. We cannot retry in + // this case, whatever the session was yielded to is responsible for that, so + // rethrow the error. iassert(status); } @@ -1059,30 +1099,10 @@ void ParseAndRunCommand::RunAndRetry::_onTenantMigrationAborted(Status& status) iassert(status); } -void ParseAndRunCommand::RunInvocation::_tapOnError(const Status& status) { - auto opCtx = _parc->_rec->getOpCtx(); - const auto command = _parc->_rec->getCommand(); - - command->incrementCommandsFailed(); - NotPrimaryErrorTracker::get(opCtx->getClient()).recordError(status.code()); - // WriteConcern error (wcCode) is set to boost::none because: - // 1. TransientTransaction error label handling for commitTransaction command in mongos is - // delegated to the shards. Mongos simply propagates the shard's response up to the client. - // 2. For other commands in a transaction, they shouldn't get a writeConcern error so this - // setting doesn't apply. - auto errorLabels = getErrorLabels(opCtx, - *_parc->_osi, - command->getName(), - status.code(), - boost::none, - false /* isInternalClient */, - true /* isMongos */); - _parc->_errorBuilder->appendElements(errorLabels); -} - Future<void> ParseAndRunCommand::RunAndRetry::run() { return makeReadyFutureWith([&] { - // Try kMaxNumStaleVersionRetries times. On the last try, exceptions are rethrown. + // Try kMaxNumStaleVersionRetries times. On the last try, exceptions are + // rethrown. _tries++; _setup(); @@ -1116,11 +1136,10 @@ Future<void> ParseAndRunCommand::RunAndRetry::run() { Future<void> ParseAndRunCommand::RunInvocation::run() { return makeReadyFutureWith([&] { - iassert(_setup()); - return future_util::makeState<RunAndRetry>(_parc).thenWithState( - [](auto* runner) { return runner->run(); }); - }) - .tapError([this](Status status) { _tapOnError(status); }); + iassert(_setup()); + return future_util::makeState<RunAndRetry>(_parc).thenWithState( + [](auto* runner) { return runner->run(); }); + }); } Future<void> ParseAndRunCommand::run() { @@ -1129,6 +1148,7 @@ Future<void> ParseAndRunCommand::run() { return future_util::makeState<RunInvocation>(this).thenWithState( [](auto* runner) { return runner->run(); }); }) + .tapError([this](Status status) { _updateStatsAndApplyErrorLabels(status); }) .onError<ErrorCodes::SkipCommandExecution>([this](Status status) { // We've already skipped execution, so no other action is required. return Status::OK(); diff --git a/src/mongo/s/mongos_main.cpp b/src/mongo/s/mongos_main.cpp index bea3ea0ae4b..0504b714aec 100644 --- a/src/mongo/s/mongos_main.cpp +++ b/src/mongo/s/mongos_main.cpp @@ -321,6 +321,12 @@ void cleanupTask(const ShutdownTaskArgs& shutdownArgs) { LOGV2(4701800, "pauseWhileKillingOperationsAtShutdown failpoint enabled"); sleepsecs(1); } + FailPoint* hangBeforeInterruptfailPoint = + globalFailPointRegistry().find("hangBeforeCheckingMongosShutdownInterrupt"); + if (hangBeforeInterruptfailPoint) { + hangBeforeInterruptfailPoint->setMode(FailPoint::Mode::off); + sleepsecs(3); + } } // Perform all shutdown operations after setKillAllOperations is called in order to ensure |