diff options
author | Josef Ahmad <josef.ahmad@mongodb.com> | 2021-12-15 09:12:12 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-18 10:12:13 +0000 |
commit | e4951d4deaa591bc82460e4de606900bed3702ca (patch) | |
tree | 727c58c92addc1b104846c000ffe6a411ea7c3fb | |
parent | fcd42d15ec4b00f9ce6b358894147b4f330035c1 (diff) | |
download | mongo-e4951d4deaa591bc82460e4de606900bed3702ca.tar.gz |
SERVER-62041 Add a maximum batch execution time to dbCheck
(cherry picked from commit 18bf3b5fcf4e06fde48e3459d4020ce3db3fa29f)
-rw-r--r-- | jstests/noPassthrough/dbcheck_batch_deadline.js | 65 | ||||
-rw-r--r-- | src/mongo/db/commands/dbcheck.cpp | 23 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.h | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.idl | 12 |
5 files changed, 108 insertions, 12 deletions
diff --git a/jstests/noPassthrough/dbcheck_batch_deadline.js b/jstests/noPassthrough/dbcheck_batch_deadline.js new file mode 100644 index 00000000000..e4516a4f4d9 --- /dev/null +++ b/jstests/noPassthrough/dbcheck_batch_deadline.js @@ -0,0 +1,65 @@ +/** + * Confirms that dbCheck stops processing a batch when reaching the deadline, and that + * the following batch resumes from where the previous one left off. + * + * @tags: [ + * requires_journaling, + * requires_replication, + * ] + */ + +(function() { +"use strict"; +load("jstests/libs/fail_point_util.js"); + +const replTest = new ReplSetTest({name: "dbcheck_batch_deadline", nodes: 2}); +replTest.startSet(); +replTest.initiate(); + +const primary = replTest.getPrimary(); +const db = primary.getDB('test'); +const coll = db.c; +const healthlog = primary.getDB('local').system.healthlog; + +// Populate collection. +const collCount = 3; +for (let i = 0; i < collCount; i++) { + assert.commandWorked(coll.insert({a: i})); +} + +// Run dbCheck with a failpoint configured so that we're only ever able to process 1 document per +// batch before hitting the 1-second default maxBatchTimeMillis. +const fp = configureFailPoint(primary, 'SleepDbCheckInBatch', {sleepMs: 2000}); +const timesEntered = fp.count; +assert.commandWorked(db.runCommand({dbCheck: coll.getName()})); + +// Wait for dbCheck to complete and disable the failpoint. +assert.soon(function() { + // Expecting one entry per document, plus a last (maxKey) entry. + const expectedHealthLogEntries = collCount + 1; + return (healthlog.find().itcount() == expectedHealthLogEntries); +}, "dbCheck command didn't complete - missing healthlog entries", 30 * 1000); +fp.off(); + +// Confirm each batch consists of 1 document, except for the last (maxKey) batch being empty. 
+assert.eq(collCount, + healthlog + .find({ + operation: "dbCheckBatch", + namespace: coll.getFullName(), + msg: "dbCheck batch consistent", + "data.count": 1 + }) + .itcount()); +assert.eq(1, + healthlog + .find({ + operation: "dbCheckBatch", + namespace: coll.getFullName(), + msg: "dbCheck batch consistent", + "data.count": 0 + }) + .itcount()); + +replTest.stopSet(); +})(); diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp index 70ef441902f..abcae164b8c 100644 --- a/src/mongo/db/commands/dbcheck.cpp +++ b/src/mongo/db/commands/dbcheck.cpp @@ -68,6 +68,7 @@ struct DbCheckCollectionInfo { int64_t maxCount; int64_t maxSize; int64_t maxRate; + int64_t maxBatchTimeMillis; }; /** @@ -89,12 +90,14 @@ std::unique_ptr<DbCheckRun> singleCollectionRun(OperationContext* opCtx, "Cannot run dbCheck on " + nss.toString() + " because it is not replicated", nss.isReplicated()); - auto start = invocation.getMinKey(); - auto end = invocation.getMaxKey(); - auto maxCount = invocation.getMaxCount(); - auto maxSize = invocation.getMaxSize(); - auto maxRate = invocation.getMaxCountPerSecond(); - auto info = DbCheckCollectionInfo{nss, start, end, maxCount, maxSize, maxRate}; + const auto start = invocation.getMinKey(); + const auto end = invocation.getMaxKey(); + const auto maxCount = invocation.getMaxCount(); + const auto maxSize = invocation.getMaxSize(); + const auto maxRate = invocation.getMaxCountPerSecond(); + const auto maxBatchTimeMillis = invocation.getMaxBatchTimeMillis(); + const auto info = + DbCheckCollectionInfo{nss, start, end, maxCount, maxSize, maxRate, maxBatchTimeMillis}; auto result = std::make_unique<DbCheckRun>(); result->push_back(info); return result; @@ -111,12 +114,14 @@ std::unique_ptr<DbCheckRun> fullDatabaseRun(OperationContext* opCtx, const int64_t max = std::numeric_limits<int64_t>::max(); const auto rate = invocation.getMaxCountPerSecond(); + const auto maxBatchTimeMillis = invocation.getMaxBatchTimeMillis(); auto result 
= std::make_unique<DbCheckRun>(); auto perCollectionWork = [&](const CollectionPtr& coll) { if (!coll->ns().isReplicated() || coll->isClustered()) { return true; } - DbCheckCollectionInfo info{coll->ns(), BSONKey::min(), BSONKey::max(), max, max, rate}; + DbCheckCollectionInfo info{ + coll->ns(), BSONKey::min(), BSONKey::max(), max, max, rate, maxBatchTimeMillis}; result->push_back(info); return true; }; @@ -375,7 +380,8 @@ private: return e.toStatus(); } - Status status = hasher->hashAll(); + const auto deadline = Date_t::now() + Milliseconds(info.maxBatchTimeMillis); + Status status = hasher->hashAll(opCtx, deadline); if (!status.isOK()) { return status; @@ -504,6 +510,7 @@ public: " maxCount: <max number of docs>,\n" " maxSize: <max size of docs>,\n" " maxCountPerSecond: <max rate in docs/sec> } " + " maxBatchTimeMillis: <max time processing a batch in milliseconds> } " "to check a collection.\n" "Invoke with {dbCheck: 1} to check all collections in the database."; } diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp index 7cca8b6a1d2..ca6d1695b05 100644 --- a/src/mongo/db/repl/dbcheck.cpp +++ b/src/mongo/db/repl/dbcheck.cpp @@ -47,6 +47,8 @@ namespace mongo { +MONGO_FAIL_POINT_DEFINE(SleepDbCheckInBatch); + namespace { /* @@ -236,11 +238,17 @@ void maybeAppend(md5_state_t* state, const boost::optional<UUID>& uuid) { } } -Status DbCheckHasher::hashAll(void) { +Status DbCheckHasher::hashAll(OperationContext* opCtx, Date_t deadline) { BSONObj currentObj; PlanExecutor::ExecState lastState; while (PlanExecutor::ADVANCED == (lastState = _exec->getNext(&currentObj, nullptr))) { + + SleepDbCheckInBatch.execute([opCtx](const BSONObj& data) { + int sleepMs = data["sleepMs"].safeNumberInt(); + opCtx->sleepFor(Milliseconds(sleepMs)); + }); + if (!currentObj.hasField("_id")) { return Status(ErrorCodes::NoSuchKey, "Document missing _id"); } @@ -256,6 +264,10 @@ Status DbCheckHasher::hashAll(void) { _countSeen += 1; md5_append(&_state, 
md5Cast(currentObj.objdata()), currentObj.objsize()); + + if (Date_t::now() > deadline) { + break; + } } // If we got to the end of the collection, set the last key to MaxKey. @@ -339,7 +351,7 @@ Status dbCheckBatchOnSecondary(OperationContext* opCtx, // run the hasher. if (status.isOK()) { - status = hasher->hashAll(); + status = hasher->hashAll(opCtx); } // In case of an error, report it to the health log, diff --git a/src/mongo/db/repl/dbcheck.h b/src/mongo/db/repl/dbcheck.h index 67076d07191..150580e7aa1 100644 --- a/src/mongo/db/repl/dbcheck.h +++ b/src/mongo/db/repl/dbcheck.h @@ -114,9 +114,9 @@ public: int64_t maxBytes = std::numeric_limits<int64_t>::max()); /** - * Hash all of our documents. + * Hash all documents up to the deadline. */ - Status hashAll(void); + Status hashAll(OperationContext* opCtx, Date_t deadline = Date_t::max()); /** * Return the total hash of all documents seen so far. diff --git a/src/mongo/db/repl/dbcheck.idl b/src/mongo/db/repl/dbcheck.idl index c074eda2112..a262aa862d9 100644 --- a/src/mongo/db/repl/dbcheck.idl +++ b/src/mongo/db/repl/dbcheck.idl @@ -112,6 +112,12 @@ structs: maxCountPerSecond: type: safeInt64 default: "std::numeric_limits<int64_t>::max()" + maxBatchTimeMillis: + type: safeInt64 + default: 1000 + validator: + gte: 10 + lte: 20000 DbCheckAllInvocation: description: "Command object for database-wide form of dbCheck invocation" @@ -122,6 +128,12 @@ structs: maxCountPerSecond: type: safeInt64 default: "std::numeric_limits<int64_t>::max()" + maxBatchTimeMillis: + type: safeInt64 + default: 1000 + validator: + gte: 10 + lte: 20000 DbCheckOplogBatch: description: "Oplog entry for a dbCheck batch" |