diff options
author | Josef Ahmad <josef.ahmad@mongodb.com> | 2021-12-18 16:51:11 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-21 12:12:37 +0000 |
commit | b169a72cd2f2d36a2059af2f484cfcca248a7e1c (patch) | |
tree | e1ae93b504519152fd01947b2a019fb3b4f0e722 | |
parent | 8b3bdf4d0ceb494c0271a2ee0b92ec446f88e285 (diff) | |
download | mongo-b169a72cd2f2d36a2059af2f484cfcca248a7e1c.tar.gz |
SERVER-62022 Reduce dbCheck info logging in prod, log start/stop
(cherry picked from commit f328a35d74b1b98497d3b57db631d51e2c6b4e29)
-rw-r--r-- | jstests/noPassthrough/dbcheck_batch_deadline.js | 51 | ||||
-rw-r--r-- | jstests/replsets/dbcheck.js | 106 | ||||
-rw-r--r-- | src/mongo/db/catalog/health_log.idl | 9 | ||||
-rw-r--r-- | src/mongo/db/commands/dbcheck.cpp | 130 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.cpp | 47 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.h | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.idl | 21 |
7 files changed, 273 insertions, 106 deletions
diff --git a/jstests/noPassthrough/dbcheck_batch_deadline.js b/jstests/noPassthrough/dbcheck_batch_deadline.js index e4516a4f4d9..30432ab4a85 100644 --- a/jstests/noPassthrough/dbcheck_batch_deadline.js +++ b/jstests/noPassthrough/dbcheck_batch_deadline.js @@ -21,6 +21,8 @@ const db = primary.getDB('test'); const coll = db.c; const healthlog = primary.getDB('local').system.healthlog; +const debugBuild = db.adminCommand('buildInfo').debug; + // Populate collection. const collCount = 3; for (let i = 0; i < collCount; i++) { @@ -35,31 +37,36 @@ assert.commandWorked(db.runCommand({dbCheck: coll.getName()})); // Wait for dbCheck to complete and disable the failpoint. assert.soon(function() { - // Expecting one entry per document, plus a last (maxKey) entry. - const expectedHealthLogEntries = collCount + 1; - return (healthlog.find().itcount() == expectedHealthLogEntries); + return (healthlog.find({"operation": "dbCheckStop"}).itcount() == 1); }, "dbCheck command didn't complete - missing healthlog entries", 30 * 1000); fp.off(); -// Confirm each batch consists of 1 document, except for the last (maxKey) batch being empty. -assert.eq(collCount, - healthlog - .find({ - operation: "dbCheckBatch", - namespace: coll.getFullName(), - msg: "dbCheck batch consistent", - "data.count": 1 - }) - .itcount()); -assert.eq(1, - healthlog - .find({ - operation: "dbCheckBatch", - namespace: coll.getFullName(), - msg: "dbCheck batch consistent", - "data.count": 0 - }) - .itcount()); +if (debugBuild) { + // These tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. + + // Confirm each batch consists of 1 document, except for the last (maxKey) batch being empty. + assert.eq(collCount, + healthlog + .find({ + operation: "dbCheckBatch", + namespace: coll.getFullName(), + msg: "dbCheck batch consistent", + "data.count": 1 + }) + .itcount()); + assert.eq(1, + healthlog + .find({ + operation: "dbCheckBatch", + namespace: coll.getFullName(), + msg: "dbCheck batch consistent", + "data.count": 0 + }) + .itcount()); +} + +assert.eq(0, healthlog.find({"severity": {$ne: "info"}}).itcount()); replTest.stopSet(); })(); diff --git a/jstests/replsets/dbcheck.js b/jstests/replsets/dbcheck.js index 419904f2e97..b431601f4da 100644 --- a/jstests/replsets/dbcheck.js +++ b/jstests/replsets/dbcheck.js @@ -36,6 +36,8 @@ function forEachNode(f) { let dbName = "dbCheck-test"; let collName = "dbcheck-collection"; +const debugBuild = replSet.getPrimary().getDB('admin').adminCommand('buildInfo').debug; + // Clear local.system.healthlog. function clearLog() { forEachNode(conn => conn.getDB("local").system.healthlog.drop()); @@ -89,10 +91,10 @@ function awaitDbCheckCompletion(db, collName, maxKey, maxSize, maxCount) { replSet.awaitReplication(); forEachNode(function(node) { - const nodeDB = node.getDB(db); - const nodeColl = node.getDB(db)[collName]; - assert.soon(() => dbCheckHealthLogCompleted(nodeDB, nodeColl, maxKey, maxSize, maxCount), - "dbCheck wait for health log timed out"); + const healthlog = node.getDB('local').system.healthlog; + assert.soon(function() { + return (healthlog.find({"operation": "dbCheckStop"}).itcount() == 1); + }, "dbCheck command didn't complete"); }); } @@ -101,20 +103,27 @@ function awaitDbCheckCompletion(db, collName, maxKey, maxSize, maxCount) { function checkLogAllConsistent(conn) { let healthlog = conn.getDB("local").system.healthlog; - assert(healthlog.find().count(), "dbCheck put no batches in health log"); - - let maxResult = healthlog.aggregate( - [{$match: {operation: "dbCheckBatch"}}, {$group: {_id: 1, key: {$max: "$data.maxKey"}}}]); + if (debugBuild) { + // These tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. + assert(healthlog.find().count(), "dbCheck put no batches in health log"); - assert(maxResult.hasNext(), "dbCheck put no batches in health log"); - assert.eq(maxResult.next().key, {"$maxKey": 1}, "dbCheck batches should end at MaxKey"); + let maxResult = healthlog.aggregate([ + {$match: {operation: "dbCheckBatch"}}, + {$group: {_id: 1, key: {$max: "$data.maxKey"}}} + ]); - let minResult = healthlog.aggregate( - [{$match: {operation: "dbCheckBatch"}}, {$group: {_id: 1, key: {$min: "$data.minKey"}}}]); + assert(maxResult.hasNext(), "dbCheck put no batches in health log"); + assert.eq(maxResult.next().key, {"$maxKey": 1}, "dbCheck batches should end at MaxKey"); - assert(minResult.hasNext(), "dbCheck put no batches in health log"); - assert.eq(minResult.next().key, {"$minKey": 1}, "dbCheck batches should start at MinKey"); + let minResult = healthlog.aggregate([ + {$match: {operation: "dbCheckBatch"}}, + {$group: {_id: 1, key: {$min: "$data.minKey"}}} + ]); + assert(minResult.hasNext(), "dbCheck put no batches in health log"); + assert.eq(minResult.next().key, {"$minKey": 1}, "dbCheck batches should start at MinKey"); + } // Assert no errors (i.e., found inconsistencies). let errs = healthlog.find({"severity": {"$ne": "info"}}); if (errs.hasNext()) { @@ -127,25 +136,29 @@ function checkLogAllConsistent(conn) { assert(false, "dbCheck batch failed: " + tojson(failedChecks.next())); } - // Finds an entry with data.minKey === MinKey, and then matches its maxKey against - // another document's minKey, and so on, and then checks that the result of that search - // has data.maxKey === MaxKey. - let completeCoverage = healthlog.aggregate([ - {$match: {"operation": "dbCheckBatch", "data.minKey": MinKey}}, - { - $graphLookup: { - from: "system.healthlog", - startWith: "$data.minKey", - connectToField: "data.minKey", - connectFromField: "data.maxKey", - as: "batchLimits", - restrictSearchWithMatch: {"operation": "dbCheckBatch"} - } - }, - {$match: {"batchLimits.data.maxKey": MaxKey}} - ]); + if (debugBuild) { + // These tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. - assert(completeCoverage.hasNext(), "dbCheck batches do not cover full key range"); + // Finds an entry with data.minKey === MinKey, and then matches its maxKey against + // another document's minKey, and so on, and then checks that the result of that search + // has data.maxKey === MaxKey. + let completeCoverage = healthlog.aggregate([ + {$match: {"operation": "dbCheckBatch", "data.minKey": MinKey}}, + { + $graphLookup: { + from: "system.healthlog", + startWith: "$data.minKey", + connectToField: "data.minKey", + connectFromField: "data.maxKey", + as: "batchLimits", + restrictSearchWithMatch: {"operation": "dbCheckBatch"} + } + }, + {$match: {"batchLimits.data.maxKey": MaxKey}} + ]); + assert(completeCoverage.hasNext(), "dbCheck batches do not cover full key range"); + } } // Check that the total of all batches in the health log on `conn` is equal to the total number @@ -154,6 +167,11 @@ function checkLogAllConsistent(conn) { // Returns a document with fields "totalDocs" and "totalBytes", representing the total size of // the batches in the health log. function healthLogCounts(healthlog) { + // These tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. + if (!debugBuild) { + return; + } let result = healthlog.aggregate([ {$match: {"operation": "dbCheckBatch"}}, { @@ -171,6 +189,11 @@ function healthLogCounts(healthlog) { } function checkTotalCounts(conn, coll) { + // These tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. + if (!debugBuild) { + return; + } let result = healthLogCounts(conn.getDB("local").system.healthlog); assert.eq(result.totalDocs, coll.count(), "dbCheck batches do not count all documents"); @@ -224,6 +247,7 @@ function simpleTestNonSnapshot() { // Same thing, but now with concurrent updates. function concurrentTestConsistent() { let primary = replSet.getPrimary(); + clearLog(); let db = primary.getDB(dbName); @@ -263,6 +287,11 @@ function testDbCheckParameters() { let docSize = bsonsize({_id: 10}); function checkEntryBounds(start, end) { + // These tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. + if (!debugBuild) { + return; + } forEachNode(function(node) { let healthlog = node.getDB("local").system.healthlog; let keyBoundsResult = healthlog.aggregate([ @@ -317,6 +346,13 @@ function testDbCheckParameters() { awaitDbCheckCompletion(db, multiBatchSimpleCollName, end, maxSize); checkEntryBounds(start, start + maxCount); + // The remaining tests only run on debug builds because they rely on dbCheck health-logging + // all info-level batch results. + + if (!debugBuild) { + return; + } + const healthlog = db.getSiblingDB('local').system.healthlog; { // Validate custom maxDocsPerBatch @@ -476,6 +512,12 @@ function simpleTestCatchesExtra() { assert.commandWorked(db.runCommand({dbCheck: collName})); awaitDbCheckCompletion(db, collName); } + assert.soon(function() { + return (replSet.getSecondary() + .getDB("local") + .system.healthlog.find({"operation": "dbCheckStop"}) + .itcount() === 1); + }, "dbCheck didn't complete on secondary"); const errors = replSet.getSecondary().getDB("local").system.healthlog.find( {operation: /dbCheck.*/, severity: "error"}); diff --git a/src/mongo/db/catalog/health_log.idl b/src/mongo/db/catalog/health_log.idl index a0f1114461d..5ce048473f1 100644 --- a/src/mongo/db/catalog/health_log.idl +++ b/src/mongo/db/catalog/health_log.idl @@ -64,11 +64,14 @@ structs: description: "An entry in system.local.healthlog." fields: namespace: - cpp_name: nss - type: namespacestring + cpp_name: nss + type: namespacestring + optional: true timestamp: date severity: Severity msg: string scope: Scope operation: string - data: object + data: + type: object + optional: true diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp index 39c6a9f9a37..8c680e23c8a 100644 --- a/src/mongo/db/commands/dbcheck.cpp +++ b/src/mongo/db/commands/dbcheck.cpp @@ -55,6 +55,79 @@ namespace mongo { namespace { +repl::OpTime _logOp(OperationContext* opCtx, + const NamespaceString& nss, + const boost::optional<UUID>& uuid, + const BSONObj& obj) { + repl::MutableOplogEntry oplogEntry; + oplogEntry.setOpType(repl::OpTypeEnum::kCommand); + oplogEntry.setNss(nss); + if (uuid) { + oplogEntry.setUuid(*uuid); + } + oplogEntry.setObject(obj); + AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite); + return writeConflictRetry( + opCtx, "dbCheck oplog entry", NamespaceString::kRsOplogNamespace.ns(), [&] { + auto const clockSource = opCtx->getServiceContext()->getFastClockSource(); + oplogEntry.setWallClockTime(clockSource->now()); + + WriteUnitOfWork uow(opCtx); + repl::OpTime result = repl::logOp(opCtx, &oplogEntry); + uow.commit(); + return result; + }); +} + +/** + * RAII-style class, which logs dbCheck start and stop events in the healthlog and replicates them. + */ +class DbCheckStartAndStopLogger { +public: + DbCheckStartAndStopLogger(OperationContext* opCtx) : _opCtx(opCtx) { + try { + const auto healthLogEntry = dbCheckHealthLogEntry(boost::none /*nss*/, + SeverityEnum::Info, + "", + OplogEntriesEnum::Start, + boost::none /*data*/ + ); + HealthLog::get(_opCtx->getServiceContext()).log(*healthLogEntry); + + DbCheckOplogStartStop oplogEntry; + const auto nss = NamespaceString("admin.$cmd"); + oplogEntry.setNss(nss); + oplogEntry.setType(OplogEntriesEnum::Start); + _logOp(_opCtx, nss, boost::none /*uuid*/, oplogEntry.toBSON()); + } catch (const DBException&) { + LOGV2(6202200, "Could not log start event"); + } + } + + ~DbCheckStartAndStopLogger() { + try { + DbCheckOplogStartStop oplogEntry; + const auto nss = NamespaceString("admin.$cmd"); + oplogEntry.setNss(nss); + oplogEntry.setType(OplogEntriesEnum::Stop); + _logOp(_opCtx, nss, boost::none /*uuid*/, oplogEntry.toBSON()); + + const auto healthLogEntry = dbCheckHealthLogEntry(boost::none /*nss*/, + SeverityEnum::Info, + "", + OplogEntriesEnum::Stop, + boost::none /*data*/ + ); + HealthLog::get(_opCtx->getServiceContext()).log(*healthLogEntry); + } catch (const DBException&) { + LOGV2(6202201, "Could not log stop event"); + } + } + +private: + OperationContext* _opCtx; +}; + /** * All the information needed to run dbCheck on a single collection. */ @@ -205,6 +278,8 @@ protected: auto uniqueOpCtx = tc->makeOperationContext(); auto opCtx = uniqueOpCtx.get(); + DbCheckStartAndStopLogger startStop(opCtx); + for (const auto& coll : *_run) { try { _doCollection(opCtx, coll); @@ -271,9 +346,10 @@ private: return; } + std::unique_ptr<HealthLogEntry> entry; + if (!result.isOK()) { bool retryable = false; - std::unique_ptr<HealthLogEntry> entry; const auto code = result.getStatus().code(); if (code == ErrorCodes::LockTimeout) { @@ -319,19 +395,27 @@ private: continue; } return; + } else { + _batchesProcessed++; + auto stats = result.getValue(); + entry = dbCheckBatchEntry(info.nss, + stats.nDocs, + stats.nBytes, + stats.md5, + stats.md5, + start, + stats.lastKey, + stats.readTimestamp, + stats.time); + if (kDebugBuild || entry->getSeverity() != SeverityEnum::Info || + (_batchesProcessed % gDbCheckHealthLogEveryNBatches.load() == 0)) { + // On debug builds, health-log every batch result; on release builds, health-log + // every N batches. + HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry); + } } auto stats = result.getValue(); - auto entry = dbCheckBatchEntry(info.nss, - stats.nDocs, - stats.nBytes, - stats.md5, - stats.md5, - start, - stats.lastKey, - stats.readTimestamp, - stats.time); - HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry); start = stats.lastKey; @@ -522,27 +606,9 @@ private: return false; } - repl::OpTime _logOp(OperationContext* opCtx, - const NamespaceString& nss, - const UUID& uuid, - const BSONObj& obj) { - repl::MutableOplogEntry oplogEntry; - oplogEntry.setOpType(repl::OpTypeEnum::kCommand); - oplogEntry.setNss(nss); - oplogEntry.setUuid(uuid); - oplogEntry.setObject(obj); - AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite); - return writeConflictRetry( - opCtx, "dbCheck oplog entry", NamespaceString::kRsOplogNamespace.ns(), [&] { - auto const clockSource = opCtx->getServiceContext()->getFastClockSource(); - oplogEntry.setWallClockTime(clockSource->now()); - - WriteUnitOfWork uow(opCtx); - repl::OpTime result = repl::logOp(opCtx, &oplogEntry); - uow.commit(); - return result; - }); - } + // Cumulative number of batches processed. Can wrap around; it's not guaranteed to be in + // lockstep with other replica set members. + unsigned int _batchesProcessed = 0; }; /** diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp index 48dec58214a..b90477059e5 100644 --- a/src/mongo/db/repl/dbcheck.cpp +++ b/src/mongo/db/repl/dbcheck.cpp @@ -110,6 +110,10 @@ std::string renderForHealthLog(OplogEntriesEnum op) { return "dbCheckBatch"; case OplogEntriesEnum::Collection: return "dbCheckCollection"; + case OplogEntriesEnum::Start: + return "dbCheckStart"; + case OplogEntriesEnum::Stop: + return "dbCheckStop"; } MONGO_UNREACHABLE; @@ -119,30 +123,35 @@ std::string renderForHealthLog(OplogEntriesEnum op) { /** * Fills in the timestamp and scope, which are always the same for dbCheck's entries. */ -std::unique_ptr<HealthLogEntry> dbCheckHealthLogEntry(const NamespaceString& nss, +std::unique_ptr<HealthLogEntry> dbCheckHealthLogEntry(const boost::optional<NamespaceString>& nss, SeverityEnum severity, const std::string& msg, OplogEntriesEnum operation, - const BSONObj& data) { + const boost::optional<BSONObj>& data) { auto entry = std::make_unique<HealthLogEntry>(); - entry->setNss(nss); + if (nss) { + entry->setNss(*nss); + } entry->setTimestamp(Date_t::now()); entry->setSeverity(severity); entry->setScope(ScopeEnum::Cluster); entry->setMsg(msg); entry->setOperation(renderForHealthLog(operation)); - entry->setData(data); + if (data) { + entry->setData(*data); + } return entry; } /** * Get an error message if the check fails. */ -std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry(const NamespaceString& nss, - const std::string& msg, - OplogEntriesEnum operation, - const Status& err, - const BSONObj& context) { +std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry( + const boost::optional<NamespaceString>& nss, + const std::string& msg, + OplogEntriesEnum operation, + const Status& err, + const BSONObj& context) { return dbCheckHealthLogEntry( nss, SeverityEnum::Error, @@ -329,6 +338,10 @@ bool DbCheckHasher::_canHash(const BSONObj& obj) { namespace { +// Cumulative number of batches processed. Can wrap around; it's not guaranteed to be in lockstep +// with other replica set members. +unsigned int batchesProcessed = 0; + Status dbCheckBatchOnSecondary(OperationContext* opCtx, const repl::OpTime& optime, const DbCheckOplogBatch& entry) { @@ -375,7 +388,13 @@ Status dbCheckBatchOnSecondary(OperationContext* opCtx, optime, collection->getCollectionOptions()); - HealthLog::get(opCtx).log(*logEntry); + batchesProcessed++; + if (kDebugBuild || logEntry->getSeverity() != SeverityEnum::Info || + (batchesProcessed % gDbCheckHealthLogEveryNBatches.load() == 0)) { + // On debug builds, health-log every batch result; on release builds, health-log + // every N batches. + HealthLog::get(opCtx).log(*logEntry); + } } catch (const DBException& exception) { // In case of an error, report it to the health log, auto logEntry = dbCheckErrorHealthLogEntry( @@ -414,6 +433,14 @@ Status dbCheckOplogCommand(OperationContext* opCtx, // TODO SERVER-61963. return Status::OK(); } + case OplogEntriesEnum::Start: + // fallthrough + case OplogEntriesEnum::Stop: + const auto entry = mongo::dbCheckHealthLogEntry( + boost::none /*nss*/, SeverityEnum::Info, "", type, boost::none /*data*/ + ); + HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry); + return Status::OK(); } MONGO_UNREACHABLE; diff --git a/src/mongo/db/repl/dbcheck.h b/src/mongo/db/repl/dbcheck.h index 73c9bef29f1..0c9368b9ad5 100644 --- a/src/mongo/db/repl/dbcheck.h +++ b/src/mongo/db/repl/dbcheck.h @@ -51,20 +51,21 @@ class OpTime; /** * Logs an entry into 'local.system.healthLog'. */ -std::unique_ptr<HealthLogEntry> dbCheckHealthLogEntry(const NamespaceString& nss, +std::unique_ptr<HealthLogEntry> dbCheckHealthLogEntry(const boost::optional<NamespaceString>& nss, SeverityEnum severity, const std::string& msg, OplogEntriesEnum operation, - const BSONObj& data); + const boost::optional<BSONObj>& data); /** * Logs an error into 'local.system.healthLog'. */ -std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry(const NamespaceString& nss, - const std::string& msg, - OplogEntriesEnum operation, - const Status& err, - const BSONObj& context = BSONObj()); +std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry( + const boost::optional<NamespaceString>& nss, + const std::string& msg, + OplogEntriesEnum operation, + const Status& err, + const BSONObj& context = BSONObj()); std::unique_ptr<HealthLogEntry> dbCheckWarningHealthLogEntry(const NamespaceString& nss, const std::string& msg, diff --git a/src/mongo/db/repl/dbcheck.idl b/src/mongo/db/repl/dbcheck.idl index c1e7fedfacc..48e71f32c31 100644 --- a/src/mongo/db/repl/dbcheck.idl +++ b/src/mongo/db/repl/dbcheck.idl @@ -73,6 +73,15 @@ server_parameters: validator: gte: 20 lte: 120000 + dbCheckHealthLogEveryNBatches: + description: 'Emit an info-severity health log batch every N batches processed' + set_at: [ startup, runtime ] + cpp_vartype: 'AtomicWord<int>' + cpp_varname: gDbCheckHealthLogEveryNBatches + default: 25 + validator: + gte: 1 + lte: 10000 types: _id_key: @@ -89,6 +98,8 @@ enums: values: Batch: "batch" Collection: "collection" + Start: "start" + Stop: "stop" structs: DbCheckSingleInvocation: @@ -216,3 +227,13 @@ structs: options: type: object cpp_name: options + + DbCheckOplogStartStop: + description: "Oplog entry for dbCheck start and stop" + fields: + dbCheck: + type: namespacestring + cpp_name: nss + type: + type: OplogEntries + cpp_name: type |