diff options
author | Josef Ahmad <josef.ahmad@mongodb.com> | 2021-12-16 10:06:05 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-17 23:06:08 +0000 |
commit | 106a7ae87b88f1ef89141c9169074f38f67a24a7 (patch) | |
tree | e112835d048aef6bb3b120990dd92cec7350bdaf | |
parent | 87d16bbe9001824aa779a12d63b6ee7fe08b5830 (diff) | |
download | mongo-106a7ae87b88f1ef89141c9169074f38f67a24a7.tar.gz |
SERVER-61852 Make dbCheck try acquire the collection lock with backoff
(cherry picked from commit af390b7d689c98998c75215167554d819e1e94ae)
-rw-r--r-- | src/mongo/db/commands/dbcheck.cpp | 156 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.idl | 38 |
2 files changed, 139 insertions, 55 deletions
diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp index 44485d798b0..69787f2e61a 100644 --- a/src/mongo/db/commands/dbcheck.cpp +++ b/src/mongo/db/commands/dbcheck.cpp @@ -290,63 +290,109 @@ private: // New OperationContext for each batch. auto uniqueOpCtx = Client::getCurrent()->makeOperationContext(); auto opCtx = uniqueOpCtx.get(); - DbCheckOplogBatch batch; - - // Acquire collection lock in S mode. - AutoGetCollection coll(opCtx, info.nss, MODE_S); - const auto& collection = coll.getCollection(); - if (_stepdownHasOccurred(opCtx, info.nss)) { - _done = true; - return Status(ErrorCodes::PrimarySteppedDown, "dbCheck terminated due to stepdown"); - } - - if (!collection) { - const auto msg = "Collection under dbCheck no longer exists"; - auto entry = dbCheckHealthLogEntry(info.nss, - SeverityEnum::Info, - "dbCheck failed", - OplogEntriesEnum::Batch, - BSON("success" << false << "error" << msg)); - HealthLog::get(opCtx).log(*entry); - return {ErrorCodes::NamespaceNotFound, msg}; - } - - boost::optional<DbCheckHasher> hasher; - try { - hasher.emplace(opCtx, - collection, - first, - info.end, - std::min(batchDocs, info.maxCount), - std::min(batchBytes, info.maxSize)); - } catch (const DBException& e) { - return e.toStatus(); - } - - Status status = hasher->hashAll(); - - if (!status.isOK()) { - return status; - } - - std::string md5 = hasher->total(); - - batch.setType(OplogEntriesEnum::Batch); - batch.setNss(info.nss); - batch.setMd5(md5); - batch.setMinKey(first); - batch.setMaxKey(BSONKey(hasher->lastKey())); BatchStats result; - - // Send information on this batch over the oplog. - result.time = _logOp(opCtx, info.nss, collection->uuid(), batch.toBSON()); - - result.nDocs = hasher->docsSeen(); - result.nBytes = hasher->bytesSeen(); - result.lastKey = hasher->lastKey(); - result.md5 = md5; - + auto timeoutMs = Milliseconds(gDbCheckCollectionTryLockTimeoutMillis.load()); + const auto initialBackoffMs = + Milliseconds(gDbCheckCollectionTryLockMinBackoffMillis.load()); + auto backoffMs = initialBackoffMs; + for (int attempt = 1;; attempt++) { + try { + // Try to acquire collection lock in S mode with increasing timeout and bounded + // exponential backoff. + auto const lockDeadline = Date_t::now() + timeoutMs; + timeoutMs *= 2; + + AutoGetCollection agc(opCtx, + info.nss, + MODE_S, + AutoGetCollection::ViewMode::kViewsForbidden, + lockDeadline); + + if (_stepdownHasOccurred(opCtx, info.nss)) { + _done = true; + return Status(ErrorCodes::PrimarySteppedDown, + "dbCheck terminated due to stepdown"); + } + + const auto& collection = + CollectionCatalog::get(opCtx).lookupCollectionByNamespace(opCtx, info.nss); + if (!collection) { + const auto msg = "Collection under dbCheck no longer exists"; + auto entry = dbCheckHealthLogEntry(info.nss, + SeverityEnum::Info, + "dbCheck failed", + OplogEntriesEnum::Batch, + BSON("success" << false << "error" << msg)); + HealthLog::get(opCtx).log(*entry); + return {ErrorCodes::NamespaceNotFound, msg}; + } + + boost::optional<DbCheckHasher> hasher; + try { + hasher.emplace(opCtx, + collection, + first, + info.end, + std::min(batchDocs, info.maxCount), + std::min(batchBytes, info.maxSize)); + } catch (const DBException& e) { + return e.toStatus(); + } + + Status status = hasher->hashAll(); + + if (!status.isOK()) { + return status; + } + + std::string md5 = hasher->total(); + + DbCheckOplogBatch batch; + batch.setType(OplogEntriesEnum::Batch); + batch.setNss(info.nss); + batch.setMd5(md5); + batch.setMinKey(first); + batch.setMaxKey(BSONKey(hasher->lastKey())); + + // Send information on this batch over the oplog. + result.time = _logOp(opCtx, info.nss, collection->uuid(), batch.toBSON()); + + result.nDocs = hasher->docsSeen(); + result.nBytes = hasher->bytesSeen(); + result.lastKey = hasher->lastKey(); + result.md5 = md5; + + break; + } catch (const ExceptionFor<ErrorCodes::LockTimeout>& e) { + if (attempt > gDbCheckCollectionTryLockMaxAttempts.load()) { + return StatusWith<BatchStats>(e.code(), + "Unable to acquire the collection lock"); + } + + // Bounded exponential backoff between tryLocks. + opCtx->sleepFor(backoffMs); + const auto maxBackoffMillis = + Milliseconds(gDbCheckCollectionTryLockMaxBackoffMillis.load()); + if (backoffMs < maxBackoffMillis) { + auto backoff = durationCount<Milliseconds>(backoffMs); + auto initialBackoff = durationCount<Milliseconds>(initialBackoffMs); + backoff *= initialBackoff; + backoffMs = Milliseconds(backoff); + } + if (backoffMs > maxBackoffMillis) { + backoffMs = maxBackoffMillis; + } + LOGV2_DEBUG(6175700, + 1, + "Could not acquire collection lock, retrying", + "ns"_attr = info.nss.ns(), + "batchRangeMin"_attr = info.start.obj(), + "batchRangeMax"_attr = info.end.obj(), + "attempt"_attr = attempt, + "backoff"_attr = backoffMs); + } + } return result; } diff --git a/src/mongo/db/repl/dbcheck.idl b/src/mongo/db/repl/dbcheck.idl index f3de878b7bb..c074eda2112 100644 --- a/src/mongo/db/repl/dbcheck.idl +++ b/src/mongo/db/repl/dbcheck.idl @@ -36,6 +36,44 @@ global: imports: - "mongo/idl/basic_types.idl" +server_parameters: + dbCheckCollectionTryLockTimeoutMillis: + description: 'Timeout to acquire the collection for processing a dbCheck batch. Each subsequent attempt doubles the timeout' + set_at: [ startup, runtime ] + cpp_vartype: 'AtomicWord<int>' + cpp_varname: gDbCheckCollectionTryLockTimeoutMillis + default: 10 + validator: + gte: 1 + lte: 10000 + dbCheckCollectionTryLockMaxAttempts: + description: 'Maximum number of attempts with backoff to acquire the collection lock for processing a dbCheck batch' + set_at: [ startup, runtime ] + cpp_vartype: 'AtomicWord<int>' + cpp_varname: gDbCheckCollectionTryLockMaxAttempts + default: 5 + validator: + gte: 1 + lte: 20 + dbCheckCollectionTryLockMinBackoffMillis: + description: 'Initial backoff on failure to acquire the collection lock for processing a dbCheck batch. Grows exponentially' + set_at: [ startup, runtime ] + cpp_vartype: 'AtomicWord<int>' + cpp_varname: gDbCheckCollectionTryLockMinBackoffMillis + default: 10 + validator: + gte: 2 + lte: 60000 + dbCheckCollectionTryLockMaxBackoffMillis: + description: 'Maximum exponential backoff on failure to acquire the collection lock for processing a dbCheck batch.' + set_at: [ startup, runtime ] + cpp_vartype: 'AtomicWord<int>' + cpp_varname: gDbCheckCollectionTryLockMaxBackoffMillis + default: 60000 + validator: + gte: 20 + lte: 120000 + types: _id_key: bson_serialization_type: any |