summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Josef Ahmad <josef.ahmad@mongodb.com> 2021-12-16 10:06:05 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-17 23:06:08 +0000
commit 106a7ae87b88f1ef89141c9169074f38f67a24a7 (patch)
tree e112835d048aef6bb3b120990dd92cec7350bdaf
parent 87d16bbe9001824aa779a12d63b6ee7fe08b5830 (diff)
download mongo-106a7ae87b88f1ef89141c9169074f38f67a24a7.tar.gz
SERVER-61852 Make dbCheck try acquire the collection lock with backoff
(cherry picked from commit af390b7d689c98998c75215167554d819e1e94ae)
-rw-r--r-- src/mongo/db/commands/dbcheck.cpp 156
-rw-r--r-- src/mongo/db/repl/dbcheck.idl 38
2 files changed, 139 insertions, 55 deletions
diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp
index 44485d798b0..69787f2e61a 100644
--- a/src/mongo/db/commands/dbcheck.cpp
+++ b/src/mongo/db/commands/dbcheck.cpp
@@ -290,63 +290,109 @@ private:
// New OperationContext for each batch.
auto uniqueOpCtx = Client::getCurrent()->makeOperationContext();
auto opCtx = uniqueOpCtx.get();
- DbCheckOplogBatch batch;
-
- // Acquire collection lock in S mode.
- AutoGetCollection coll(opCtx, info.nss, MODE_S);
- const auto& collection = coll.getCollection();
- if (_stepdownHasOccurred(opCtx, info.nss)) {
- _done = true;
- return Status(ErrorCodes::PrimarySteppedDown, "dbCheck terminated due to stepdown");
- }
-
- if (!collection) {
- const auto msg = "Collection under dbCheck no longer exists";
- auto entry = dbCheckHealthLogEntry(info.nss,
- SeverityEnum::Info,
- "dbCheck failed",
- OplogEntriesEnum::Batch,
- BSON("success" << false << "error" << msg));
- HealthLog::get(opCtx).log(*entry);
- return {ErrorCodes::NamespaceNotFound, msg};
- }
-
- boost::optional<DbCheckHasher> hasher;
- try {
- hasher.emplace(opCtx,
- collection,
- first,
- info.end,
- std::min(batchDocs, info.maxCount),
- std::min(batchBytes, info.maxSize));
- } catch (const DBException& e) {
- return e.toStatus();
- }
-
- Status status = hasher->hashAll();
-
- if (!status.isOK()) {
- return status;
- }
-
- std::string md5 = hasher->total();
-
- batch.setType(OplogEntriesEnum::Batch);
- batch.setNss(info.nss);
- batch.setMd5(md5);
- batch.setMinKey(first);
- batch.setMaxKey(BSONKey(hasher->lastKey()));
BatchStats result;
-
- // Send information on this batch over the oplog.
- result.time = _logOp(opCtx, info.nss, collection->uuid(), batch.toBSON());
-
- result.nDocs = hasher->docsSeen();
- result.nBytes = hasher->bytesSeen();
- result.lastKey = hasher->lastKey();
- result.md5 = md5;
-
+ auto timeoutMs = Milliseconds(gDbCheckCollectionTryLockTimeoutMillis.load());
+ const auto initialBackoffMs =
+ Milliseconds(gDbCheckCollectionTryLockMinBackoffMillis.load());
+ auto backoffMs = initialBackoffMs;
+ for (int attempt = 1;; attempt++) {
+ try {
+ // Try to acquire collection lock in S mode with increasing timeout and bounded
+ // exponential backoff.
+ auto const lockDeadline = Date_t::now() + timeoutMs;
+ timeoutMs *= 2;
+
+ AutoGetCollection agc(opCtx,
+ info.nss,
+ MODE_S,
+ AutoGetCollection::ViewMode::kViewsForbidden,
+ lockDeadline);
+
+ if (_stepdownHasOccurred(opCtx, info.nss)) {
+ _done = true;
+ return Status(ErrorCodes::PrimarySteppedDown,
+ "dbCheck terminated due to stepdown");
+ }
+
+ const auto& collection =
+ CollectionCatalog::get(opCtx).lookupCollectionByNamespace(opCtx, info.nss);
+ if (!collection) {
+ const auto msg = "Collection under dbCheck no longer exists";
+ auto entry = dbCheckHealthLogEntry(info.nss,
+ SeverityEnum::Info,
+ "dbCheck failed",
+ OplogEntriesEnum::Batch,
+ BSON("success" << false << "error" << msg));
+ HealthLog::get(opCtx).log(*entry);
+ return {ErrorCodes::NamespaceNotFound, msg};
+ }
+
+ boost::optional<DbCheckHasher> hasher;
+ try {
+ hasher.emplace(opCtx,
+ collection,
+ first,
+ info.end,
+ std::min(batchDocs, info.maxCount),
+ std::min(batchBytes, info.maxSize));
+ } catch (const DBException& e) {
+ return e.toStatus();
+ }
+
+ Status status = hasher->hashAll();
+
+ if (!status.isOK()) {
+ return status;
+ }
+
+ std::string md5 = hasher->total();
+
+ DbCheckOplogBatch batch;
+ batch.setType(OplogEntriesEnum::Batch);
+ batch.setNss(info.nss);
+ batch.setMd5(md5);
+ batch.setMinKey(first);
+ batch.setMaxKey(BSONKey(hasher->lastKey()));
+
+ // Send information on this batch over the oplog.
+ result.time = _logOp(opCtx, info.nss, collection->uuid(), batch.toBSON());
+
+ result.nDocs = hasher->docsSeen();
+ result.nBytes = hasher->bytesSeen();
+ result.lastKey = hasher->lastKey();
+ result.md5 = md5;
+
+ break;
+ } catch (const ExceptionFor<ErrorCodes::LockTimeout>& e) {
+ if (attempt > gDbCheckCollectionTryLockMaxAttempts.load()) {
+ return StatusWith<BatchStats>(e.code(),
+ "Unable to acquire the collection lock");
+ }
+
+ // Bounded exponential backoff between tryLocks.
+ opCtx->sleepFor(backoffMs);
+ const auto maxBackoffMillis =
+ Milliseconds(gDbCheckCollectionTryLockMaxBackoffMillis.load());
+ if (backoffMs < maxBackoffMillis) {
+ auto backoff = durationCount<Milliseconds>(backoffMs);
+ auto initialBackoff = durationCount<Milliseconds>(initialBackoffMs);
+ backoff *= initialBackoff;
+ backoffMs = Milliseconds(backoff);
+ }
+ if (backoffMs > maxBackoffMillis) {
+ backoffMs = maxBackoffMillis;
+ }
+ LOGV2_DEBUG(6175700,
+ 1,
+ "Could not acquire collection lock, retrying",
+ "ns"_attr = info.nss.ns(),
+ "batchRangeMin"_attr = info.start.obj(),
+ "batchRangeMax"_attr = info.end.obj(),
+ "attempt"_attr = attempt,
+ "backoff"_attr = backoffMs);
+ }
+ }
return result;
}
diff --git a/src/mongo/db/repl/dbcheck.idl b/src/mongo/db/repl/dbcheck.idl
index f3de878b7bb..c074eda2112 100644
--- a/src/mongo/db/repl/dbcheck.idl
+++ b/src/mongo/db/repl/dbcheck.idl
@@ -36,6 +36,44 @@ global:
imports:
- "mongo/idl/basic_types.idl"
+server_parameters:
+ dbCheckCollectionTryLockTimeoutMillis:
+ description: 'Timeout to acquire the collection for processing a dbCheck batch. Each subsequent attempt doubles the timeout'
+ set_at: [ startup, runtime ]
+ cpp_vartype: 'AtomicWord<int>'
+ cpp_varname: gDbCheckCollectionTryLockTimeoutMillis
+ default: 10
+ validator:
+ gte: 1
+ lte: 10000
+ dbCheckCollectionTryLockMaxAttempts:
+ description: 'Maximum number of attempts with backoff to acquire the collection lock for processing a dbCheck batch'
+ set_at: [ startup, runtime ]
+ cpp_vartype: 'AtomicWord<int>'
+ cpp_varname: gDbCheckCollectionTryLockMaxAttempts
+ default: 5
+ validator:
+ gte: 1
+ lte: 20
+ dbCheckCollectionTryLockMinBackoffMillis:
+ description: 'Initial backoff on failure to acquire the collection lock for processing a dbCheck batch. Grows exponentially'
+ set_at: [ startup, runtime ]
+ cpp_vartype: 'AtomicWord<int>'
+ cpp_varname: gDbCheckCollectionTryLockMinBackoffMillis
+ default: 10
+ validator:
+ gte: 2
+ lte: 60000
+ dbCheckCollectionTryLockMaxBackoffMillis:
+ description: 'Maximum exponential backoff on failure to acquire the collection lock for processing a dbCheck batch.'
+ set_at: [ startup, runtime ]
+ cpp_vartype: 'AtomicWord<int>'
+ cpp_varname: gDbCheckCollectionTryLockMaxBackoffMillis
+ default: 60000
+ validator:
+ gte: 20
+ lte: 120000
+
types:
_id_key:
bson_serialization_type: any