summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Williams <louis.williams@mongodb.com>2022-03-11 10:01:57 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-03-11 10:33:12 +0000
commit8e616cc3ef082ec67a04393ca3435d4b81f44da4 (patch)
tree83c571700a891dae5a76db6f43f0d0f7d932b372
parent0492ea43ae52b6102e6637b0cd745a2901e6bf06 (diff)
downloadmongo-8e616cc3ef082ec67a04393ca3435d4b81f44da4.tar.gz
SERVER-64050 Increase TemporarilyUnavailable retry backoff and enable by default
* Alters the base backoff period to 1 second and max retries to 10. * Removes the loadShedding server parameter and enables internal TemporarilyUnavailable retry behavior by default. * TemporarilyUnavailable errors are converted to WriteConflicts when returned inside multi-document transactions.
-rw-r--r--jstests/noPassthrough/temporarily_unavailable_error.js98
-rw-r--r--src/mongo/db/concurrency/temporarily_unavailable_exception.cpp20
-rw-r--r--src/mongo/db/concurrency/temporarily_unavailable_exception.h9
-rw-r--r--src/mongo/db/concurrency/write_conflict_exception.h22
-rw-r--r--src/mongo/db/concurrency/write_conflict_exception.idl18
-rw-r--r--src/mongo/db/server_options_general.idl7
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp3
7 files changed, 129 insertions, 48 deletions
diff --git a/jstests/noPassthrough/temporarily_unavailable_error.js b/jstests/noPassthrough/temporarily_unavailable_error.js
index 253c08cead3..d70841733c8 100644
--- a/jstests/noPassthrough/temporarily_unavailable_error.js
+++ b/jstests/noPassthrough/temporarily_unavailable_error.js
@@ -3,20 +3,31 @@
* with TemporarilyUnavailable error.
*
* @tags: [
- * assumes_against_mongod_not_mongos,
- * does_not_support_stepdowns,
* // Exclude in-memory engine, rollbacks due to pinned cache content rely on eviction.
* requires_journaling,
* requires_persistence,
+ * requires_replication,
* requires_wiredtiger,
* ]
*/
(function() {
"use strict";
-const mongod =
- MongoRunner.runMongod({wiredTigerCacheSizeGB: 0.256, setParameter: "loadShedding=1"});
-const db = mongod.getDB("test");
+const replSet = new ReplSetTest({
+ nodes: 1,
+ nodeOptions: {
+ wiredTigerCacheSizeGB: 0.256,
+ setParameter: {
+ logComponentVerbosity: tojson({control: 1, write: 1}),
+ // Lower these values from the defaults to speed up the test.
+ temporarilyUnavailableMaxRetries: 3,
+ temporarilyUnavailableBackoffBaseMs: 10,
+ }
+ },
+});
+replSet.startSet();
+replSet.initiate();
+const db = replSet.getPrimary().getDB("test");
// Generate a workload pinning enough dirty data in cache that causes WiredTiger
// to roll back transactions. This workload is adapted from the reproducer in the
@@ -26,26 +37,63 @@ let doc = {x: []};
for (let j = 0; j < 50000; j++)
doc.x.push("" + Math.random() + Math.random());
-let caughtTUerror = false;
-let attempts;
-for (attempts = 1; attempts <= 20; attempts++) {
- const ret = db.c.insert(doc);
+(function temporarilyUnavailableNonTransaction() {
+ let caughtTUerror = false;
+ let attempts;
+ for (attempts = 1; attempts <= 20; attempts++) {
+ print("temporarilyUnavailableNonTransaction attempt " + attempts);
+ const ret = db.c.insert(doc);
- if (ret["nInserted"] === 1) {
- // The write succeeded.
- continue;
+ if (ret["nInserted"] === 1) {
+ // The write succeeded.
+ continue;
+ }
+ assert.eq(0, ret["nInserted"]);
+ assert.commandFailedWithCode(ret, ErrorCodes.TemporarilyUnavailable);
+ caughtTUerror = true;
+ jsTestLog("returned the expected TemporarilyUnavailable code at attempt " + attempts);
+ break;
}
- assert.eq(0, ret["nInserted"]);
- assert.commandFailedWithCode(ret, ErrorCodes.TemporarilyUnavailable);
- caughtTUerror = true;
- jsTestLog("returned the expected TemporarilyUnavailable code at attempt " + attempts);
- break;
-}
-
-assert.eq(true,
- caughtTUerror,
- "did not return the expected TemporarilyUnavailable error after " + (attempts - 1) +
- " attempts");
-
-MongoRunner.stopMongod(mongod);
+
+ assert(caughtTUerror,
+ "did not return the expected TemporarilyUnavailable error after " + (attempts - 1) +
+ " attempts");
+})();
+
+(function temporarilyUnavailableInTransactionIsConvertedToWriteConflict() {
+ // Inside a transaction, TemporarilyUnavailable errors should be converted to
+ // WriteConflicts and tagged as TransientTransactionErrors.
+ let caughtWriteConflict = false;
+ let attempts;
+ let ret;
+ for (attempts = 1; attempts <= 20; attempts++) {
+ print("temporarilyUnavailableInTransactionIsConvertedToWriteConflict attempt " + attempts);
+ const session = db.getMongo().startSession();
+ session.startTransaction();
+ const sessionDB = session.getDatabase('test');
+ ret = sessionDB.c.insert(doc);
+
+ if (ret["nInserted"] === 1) {
+ // The write succeeded.
+ session.commitTransaction();
+ continue;
+ }
+ assert.commandFailedWithCode(
+ ret, ErrorCodes.WriteConflict, "Did not get WriteConflict. Result: " + tojson(ret));
+ assert(ret.hasOwnProperty("errorLabels"), "missing errorLabels. Result: " + tojson(ret));
+ assert.contains("TransientTransactionError",
+ ret.errorLabels,
+ "did not find TransientTransaction error label. Result: " + tojson(ret));
+ caughtWriteConflict = true;
+ jsTestLog("returned the expected WriteConflict code at attempt " + attempts);
+ session.abortTransaction();
+ break;
+ }
+
+ assert(caughtWriteConflict,
+ "did not return the expected WriteConflict error after " + (attempts - 1) +
+ " attempts. Result: " + tojson(ret));
+})();
+
+replSet.stopSet();
})();
diff --git a/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp b/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp
index 29543db0f30..98131fc3140 100644
--- a/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp
+++ b/src/mongo/db/concurrency/temporarily_unavailable_exception.cpp
@@ -37,10 +37,12 @@
namespace mongo {
+// These are initialized by IDL as server parameters.
+AtomicWord<long long> TemporarilyUnavailableException::maxRetryAttempts;
+AtomicWord<long long> TemporarilyUnavailableException::retryBackoffBaseMs;
+
TemporarilyUnavailableException::TemporarilyUnavailableException(StringData context)
- : DBException(Status(ErrorCodes::TemporarilyUnavailable, context)) {
- invariant(gLoadShedding);
-}
+ : DBException(Status(ErrorCodes::TemporarilyUnavailable, context)) {}
void TemporarilyUnavailableException::handle(OperationContext* opCtx,
int attempts,
@@ -49,23 +51,29 @@ void TemporarilyUnavailableException::handle(OperationContext* opCtx,
const TemporarilyUnavailableException& e) {
opCtx->recoveryUnit()->abandonSnapshot();
if (opCtx->getClient()->isFromUserConnection() &&
- attempts > TemporarilyUnavailableException::kMaxRetryAttempts) {
+ attempts > TemporarilyUnavailableException::maxRetryAttempts.load()) {
LOGV2_DEBUG(6083901,
1,
"Too many TemporarilyUnavailableException's, giving up",
+ "reason"_attr = e.reason(),
"attempts"_attr = attempts,
"operation"_attr = opStr,
logAttrs(NamespaceString(ns)));
throw e;
}
+
+ // Back off linearly with the retry attempt number.
+ auto sleepFor =
+ Milliseconds(TemporarilyUnavailableException::retryBackoffBaseMs.load()) * attempts;
LOGV2_DEBUG(6083900,
1,
"Caught TemporarilyUnavailableException",
+ "reason"_attr = e.reason(),
"attempts"_attr = attempts,
"operation"_attr = opStr,
+ "sleepFor"_attr = sleepFor,
logAttrs(NamespaceString(ns)));
- // Back off linearly with the retry attempt number.
- opCtx->sleepFor(TemporarilyUnavailableException::kRetryBackoff * attempts);
+ opCtx->sleepFor(sleepFor);
}
} // namespace mongo
diff --git a/src/mongo/db/concurrency/temporarily_unavailable_exception.h b/src/mongo/db/concurrency/temporarily_unavailable_exception.h
index 20702fe6d79..ecd12c46943 100644
--- a/src/mongo/db/concurrency/temporarily_unavailable_exception.h
+++ b/src/mongo/db/concurrency/temporarily_unavailable_exception.h
@@ -38,13 +38,14 @@ namespace mongo {
/**
* This exception is thrown if an operation aborts due to the server being temporarily
- * unavailable, e.g. due to excessive load. May only be used when loadShedding is
- * enabled.
+ * unavailable, e.g. due to excessive load. For user-originating operations, this will be retried
+ * internally by the writeConflictRetry helper a finite number of times before eventually being
+ * returned.
*/
class TemporarilyUnavailableException final : public DBException {
public:
- static constexpr int kMaxRetryAttempts = 3;
- static constexpr auto kRetryBackoff = Milliseconds(100);
+ static AtomicWord<long long> maxRetryAttempts;
+ static AtomicWord<long long> retryBackoffBaseMs;
TemporarilyUnavailableException(StringData context);
diff --git a/src/mongo/db/concurrency/write_conflict_exception.h b/src/mongo/db/concurrency/write_conflict_exception.h
index 4513b008866..745f63e8e64 100644
--- a/src/mongo/db/concurrency/write_conflict_exception.h
+++ b/src/mongo/db/concurrency/write_conflict_exception.h
@@ -71,9 +71,12 @@ private:
/**
* Runs the argument function f as many times as needed for f to complete or throw an exception
- * other than WriteConflictException. For each time f throws a WriteConflictException, logs the
- * error, waits a spell, cleans up, and then tries f again. Imposes no upper limit on the number
- * of times to re-try f, so any required timeout behavior must be enforced within f.
+ * other than WriteConflictException or TemporarilyUnavailableException. For each time f throws an
+ * one of these exceptions, logs the error, waits a spell, cleans up, and then tries f again.
+ * Imposes no upper limit on the number of times to re-try f after a WriteConflictException, so any
+ * required timeout behavior must be enforced within f. When retrying a
+ * TemporarilyUnavailableException, f is called a finite number of times before we eventually let
+ * the error escape.
*
* If we are already in a WriteUnitOfWork, we assume that we are being called within a
* WriteConflictException retry loop up the call stack. Hence, this retry loop is reduced to an
@@ -91,7 +94,18 @@ auto writeConflictRetry(OperationContext* opCtx, StringData opStr, StringData ns
bool userSkipWriteConflictRetry = MONGO_unlikely(skipWriteConflictRetries.shouldFail()) &&
opCtx->getClient()->isFromUserConnection();
if (opCtx->lockState()->inAWriteUnitOfWork() || userSkipWriteConflictRetry) {
- return f();
+ try {
+ return f();
+ } catch (TemporarilyUnavailableException const& e) {
+ if (opCtx->inMultiDocumentTransaction()) {
+ // Since WriteConflicts are tagged as TransientTransactionErrors and
+ // TemporarilyUnavailable errors are not, we convert the error to a WriteConflict to
+ // allow users of multi-document transactions to retry without changing any
+ // behavior. Otherwise, we let the error escape as usual.
+ throw WriteConflictException(e.reason());
+ }
+ throw;
+ }
}
int attempts = 0;
diff --git a/src/mongo/db/concurrency/write_conflict_exception.idl b/src/mongo/db/concurrency/write_conflict_exception.idl
index 0f80fa37d61..c1ec8cb57e5 100644
--- a/src/mongo/db/concurrency/write_conflict_exception.idl
+++ b/src/mongo/db/concurrency/write_conflict_exception.idl
@@ -35,3 +35,21 @@ server_parameters:
description: 'Call printStackTrace on every WriteConflictException created'
set_at: [ startup, runtime ]
cpp_varname: 'WriteConflictException::trace'
+
+ temporarilyUnavailableMaxRetries:
+ description: 'The number of times to retry a TemporarilyUnavailable error internally'
+ set_at: [ startup, runtime ]
+ cpp_varname: 'TemporarilyUnavailableException::maxRetryAttempts'
+ default: 10
+ validator:
+ gte: 0
+
+ temporarilyUnavailableBackoffBaseMs:
+ description: 'The base period of time to wait between each TemporarilyUnavailable retry
+ attempt. The backoff time is linear such that the Nth retry waits for N times
+ the base backoff period.'
+ set_at: [ startup, runtime ]
+ cpp_varname: 'TemporarilyUnavailableException::retryBackoffBaseMs'
+ default: 1000
+ validator:
+ gte: 0
diff --git a/src/mongo/db/server_options_general.idl b/src/mongo/db/server_options_general.idl
index d086ebf9fa0..53e303f3aa9 100644
--- a/src/mongo/db/server_options_general.idl
+++ b/src/mongo/db/server_options_general.idl
@@ -37,13 +37,6 @@ global:
initializer:
register: addGeneralServerOptionDefinitions
-server_parameters:
- 'loadShedding':
- description: 'Enable load shedding'
- set_at: startup
- cpp_vartype: bool
- cpp_varname: gLoadShedding
-
configs:
help:
description: 'Show this usage information'
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp
index a27eeb2b5b7..30fc736b576 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp
@@ -171,8 +171,7 @@ Status wtRCToStatus_slow(int retCode, WT_SESSION* session, StringData prefix) {
return false;
}();
- const bool loadShedding = gLoadShedding;
- if (loadShedding && reasonIsCachePressure) {
+ if (reasonIsCachePressure) {
str::stream s;
if (!prefix.empty())
s << prefix << " ";