diff options
author | Louis Williams <louis.williams@mongodb.com> | 2022-03-11 10:01:57 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-03-11 10:33:12 +0000 |
commit | 8e616cc3ef082ec67a04393ca3435d4b81f44da4 (patch) | |
tree | 83c571700a891dae5a76db6f43f0d0f7d932b372 | |
parent | 0492ea43ae52b6102e6637b0cd745a2901e6bf06 (diff) | |
download | mongo-8e616cc3ef082ec67a04393ca3435d4b81f44da4.tar.gz |
SERVER-64050 Increase TemporarilyUnavailable retry backoff and enable by default
* Alters the base backoff period to 1 second and max retries to 10.
* Removes the loadShedding server parameter and enables internal
TemporarilyUnavailable retry behavior by default.
* TemporarilyUnavailable errors are converted to WriteConflicts when returned
inside multi-document transactions.
7 files changed, 129 insertions, 48 deletions
diff --git a/jstests/noPassthrough/temporarily_unavailable_error.js b/jstests/noPassthrough/temporarily_unavailable_error.js index 253c08cead3..d70841733c8 100644 --- a/jstests/noPassthrough/temporarily_unavailable_error.js +++ b/jstests/noPassthrough/temporarily_unavailable_error.js @@ -3,20 +3,31 @@ * with TemporarilyUnavailable error. * * @tags: [ - * assumes_against_mongod_not_mongos, - * does_not_support_stepdowns, * // Exclude in-memory engine, rollbacks due to pinned cache content rely on eviction. * requires_journaling, * requires_persistence, + * requires_replication, * requires_wiredtiger, * ] */ (function() { "use strict"; -const mongod = - MongoRunner.runMongod({wiredTigerCacheSizeGB: 0.256, setParameter: "loadShedding=1"}); -const db = mongod.getDB("test"); +const replSet = new ReplSetTest({ + nodes: 1, + nodeOptions: { + wiredTigerCacheSizeGB: 0.256, + setParameter: { + logComponentVerbosity: tojson({control: 1, write: 1}), + // Lower these values from the defaults to speed up the test. + temporarilyUnavailableMaxRetries: 3, + temporarilyUnavailableBackoffBaseMs: 10, + } + }, +}); +replSet.startSet(); +replSet.initiate(); +const db = replSet.getPrimary().getDB("test"); // Generate a workload pinning enough dirty data in cache that causes WiredTiger // to roll back transactions. This workload is adapted from the reproducer in the @@ -26,26 +37,63 @@ let doc = {x: []}; for (let j = 0; j < 50000; j++) doc.x.push("" + Math.random() + Math.random()); -let caughtTUerror = false; -let attempts; -for (attempts = 1; attempts <= 20; attempts++) { - const ret = db.c.insert(doc); +(function temporarilyUnavailableNonTransaction() { + let caughtTUerror = false; + let attempts; + for (attempts = 1; attempts <= 20; attempts++) { + print("temporarilyUnavailableNonTransaction attempt " + attempts); + const ret = db.c.insert(doc); - if (ret["nInserted"] === 1) { - // The write succeeded. - continue; + if (ret["nInserted"] === 1) { + // The write succeeded. + continue; + } + assert.eq(0, ret["nInserted"]); + assert.commandFailedWithCode(ret, ErrorCodes.TemporarilyUnavailable); + caughtTUerror = true; + jsTestLog("returned the expected TemporarilyUnavailable code at attempt " + attempts); + break; } - assert.eq(0, ret["nInserted"]); - assert.commandFailedWithCode(ret, ErrorCodes.TemporarilyUnavailable); - caughtTUerror = true; - jsTestLog("returned the expected TemporarilyUnavailable code at attempt " + attempts); - break; -} - -assert.eq(true, - caughtTUerror, - "did not return the expected TemporarilyUnavailable error after " + (attempts - 1) + - " attempts"); - -MongoRunner.stopMongod(mongod); + + assert(caughtTUerror, + "did not return the expected TemporarilyUnavailable error after " + (attempts - 1) + + " attempts"); +})(); + +(function temporarilyUnavailableInTransactionIsConvertedToWriteConflict() { + // Inside a transaction, TemporarilyUnavailable errors should be converted to + // WriteConflicts and tagged as TransientTransactionErrors. + let caughtWriteConflict = false; + let attempts; + let ret; + for (attempts = 1; attempts <= 20; attempts++) { + print("temporarilyUnavailableInTransactionIsConvertedToWriteConflict attempt " + attempts); + const session = db.getMongo().startSession(); + session.startTransaction(); + const sessionDB = session.getDatabase('test'); + ret = sessionDB.c.insert(doc); + + if (ret["nInserted"] === 1) { + // The write succeeded. + session.commitTransaction(); + continue; + } + assert.commandFailedWithCode( + ret, ErrorCodes.WriteConflict, "Did not get WriteConflict. Result: " + tojson(ret)); + assert(ret.hasOwnProperty("errorLabels"), "missing errorLabels. Result: " + tojson(ret)); + assert.contains("TransientTransactionError", + ret.errorLabels, + "did not find TransientTransaction error label. Result: " + tojson(ret)); + caughtWriteConflict = true; + jsTestLog("returned the expected WriteConflict code at attempt " + attempts); + session.abortTransaction(); + break; + } + + assert(caughtWriteConflict, + "did not return the expected WriteConflict error after " + (attempts - 1) + + " attempts. Result: " + tojson(ret)); +})(); + +replSet.stopSet(); })(); diff --git a/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp b/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp index 29543db0f30..98131fc3140 100644 --- a/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp +++ b/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp @@ -37,10 +37,12 @@ namespace mongo { +// These are initialized by IDL as server parameters. +AtomicWord<long long> TemporarilyUnavailableException::maxRetryAttempts; +AtomicWord<long long> TemporarilyUnavailableException::retryBackoffBaseMs; + TemporarilyUnavailableException::TemporarilyUnavailableException(StringData context) - : DBException(Status(ErrorCodes::TemporarilyUnavailable, context)) { - invariant(gLoadShedding); -} + : DBException(Status(ErrorCodes::TemporarilyUnavailable, context)) {} void TemporarilyUnavailableException::handle(OperationContext* opCtx, int attempts, @@ -49,23 +51,29 @@ void TemporarilyUnavailableException::handle(OperationContext* opCtx, const TemporarilyUnavailableException& e) { opCtx->recoveryUnit()->abandonSnapshot(); if (opCtx->getClient()->isFromUserConnection() && - attempts > TemporarilyUnavailableException::kMaxRetryAttempts) { + attempts > TemporarilyUnavailableException::maxRetryAttempts.load()) { LOGV2_DEBUG(6083901, 1, "Too many TemporarilyUnavailableException's, giving up", + "reason"_attr = e.reason(), "attempts"_attr = attempts, "operation"_attr = opStr, logAttrs(NamespaceString(ns))); throw e; } + + // Back off linearly with the retry attempt number. + auto sleepFor = + Milliseconds(TemporarilyUnavailableException::retryBackoffBaseMs.load()) * attempts; LOGV2_DEBUG(6083900, 1, "Caught TemporarilyUnavailableException", + "reason"_attr = e.reason(), "attempts"_attr = attempts, "operation"_attr = opStr, + "sleepFor"_attr = sleepFor, logAttrs(NamespaceString(ns))); - // Back off linearly with the retry attempt number. - opCtx->sleepFor(TemporarilyUnavailableException::kRetryBackoff * attempts); + opCtx->sleepFor(sleepFor); } } // namespace mongo diff --git a/src/mongo/db/concurrency/temporarily_unavailable_exception.h b/src/mongo/db/concurrency/temporarily_unavailable_exception.h index 20702fe6d79..ecd12c46943 100644 --- a/src/mongo/db/concurrency/temporarily_unavailable_exception.h +++ b/src/mongo/db/concurrency/temporarily_unavailable_exception.h @@ -38,13 +38,14 @@ namespace mongo { /** * This exception is thrown if an operation aborts due to the server being temporarily - * unavailable, e.g. due to excessive load. May only be used when loadShedding is - * enabled. + * unavailable, e.g. due to excessive load. For user-originating operations, this will be retried + * internally by the writeConflictRetry helper a finite number of times before eventually being + * returned. */ class TemporarilyUnavailableException final : public DBException { public: - static constexpr int kMaxRetryAttempts = 3; - static constexpr auto kRetryBackoff = Milliseconds(100); + static AtomicWord<long long> maxRetryAttempts; + static AtomicWord<long long> retryBackoffBaseMs; TemporarilyUnavailableException(StringData context); diff --git a/src/mongo/db/concurrency/write_conflict_exception.h b/src/mongo/db/concurrency/write_conflict_exception.h index 4513b008866..745f63e8e64 100644 --- a/src/mongo/db/concurrency/write_conflict_exception.h +++ b/src/mongo/db/concurrency/write_conflict_exception.h @@ -71,9 +71,12 @@ private: /** * Runs the argument function f as many times as needed for f to complete or throw an exception - * other than WriteConflictException. For each time f throws a WriteConflictException, logs the - * error, waits a spell, cleans up, and then tries f again. Imposes no upper limit on the number - * of times to re-try f, so any required timeout behavior must be enforced within f. + * other than WriteConflictException or TemporarilyUnavailableException. For each time f throws an + * one of these exceptions, logs the error, waits a spell, cleans up, and then tries f again. + * Imposes no upper limit on the number of times to re-try f after a WriteConflictException, so any + * required timeout behavior must be enforced within f. When retrying a + * TemporarilyUnavailableException, f is called a finite number of times before we eventually let + * the error escape. * * If we are already in a WriteUnitOfWork, we assume that we are being called within a * WriteConflictException retry loop up the call stack. Hence, this retry loop is reduced to an @@ -91,7 +94,18 @@ auto writeConflictRetry(OperationContext* opCtx, StringData opStr, StringData ns bool userSkipWriteConflictRetry = MONGO_unlikely(skipWriteConflictRetries.shouldFail()) && opCtx->getClient()->isFromUserConnection(); if (opCtx->lockState()->inAWriteUnitOfWork() || userSkipWriteConflictRetry) { - return f(); + try { + return f(); + } catch (TemporarilyUnavailableException const& e) { + if (opCtx->inMultiDocumentTransaction()) { + // Since WriteConflicts are tagged as TransientTransactionErrors and + // TemporarilyUnavailable errors are not, we convert the error to a WriteConflict to + // allow users of multi-document transactions to retry without changing any + // behavior. Otherwise, we let the error escape as usual. + throw WriteConflictException(e.reason()); + } + throw; + } } int attempts = 0; diff --git a/src/mongo/db/concurrency/write_conflict_exception.idl b/src/mongo/db/concurrency/write_conflict_exception.idl index 0f80fa37d61..c1ec8cb57e5 100644 --- a/src/mongo/db/concurrency/write_conflict_exception.idl +++ b/src/mongo/db/concurrency/write_conflict_exception.idl @@ -35,3 +35,21 @@ server_parameters: description: 'Call printStackTrace on every WriteConflictException created' set_at: [ startup, runtime ] cpp_varname: 'WriteConflictException::trace' + + temporarilyUnavailableMaxRetries: + description: 'The number of times to retry a TemporarilyUnavailable error internally' + set_at: [ startup, runtime ] + cpp_varname: 'TemporarilyUnavailableException::maxRetryAttempts' + default: 10 + validator: + gte: 0 + + temporarilyUnavailableBackoffBaseMs: + description: 'The base period of time to wait between each TemporarilyUnavailable retry + attempt. The backoff time is linear such that the Nth retry waits for N times + the base backoff period.' + set_at: [ startup, runtime ] + cpp_varname: 'TemporarilyUnavailableException::retryBackoffBaseMs' + default: 1000 + validator: + gte: 0 diff --git a/src/mongo/db/server_options_general.idl b/src/mongo/db/server_options_general.idl index d086ebf9fa0..53e303f3aa9 100644 --- a/src/mongo/db/server_options_general.idl +++ b/src/mongo/db/server_options_general.idl @@ -37,13 +37,6 @@ global: initializer: register: addGeneralServerOptionDefinitions -server_parameters: - 'loadShedding': - description: 'Enable load shedding' - set_at: startup - cpp_vartype: bool - cpp_varname: gLoadShedding - configs: help: description: 'Show this usage information' diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp index a27eeb2b5b7..30fc736b576 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp @@ -171,8 +171,7 @@ Status wtRCToStatus_slow(int retCode, WT_SESSION* session, StringData prefix) { return false; }(); - const bool loadShedding = gLoadShedding; - if (loadShedding && reasonIsCachePressure) { + if (reasonIsCachePressure) { str::stream s; if (!prefix.empty()) s << prefix << " "; |