summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJosef Ahmad <josef.ahmad@mongodb.com>2021-12-15 09:12:12 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-12-18 10:12:13 +0000
commite4951d4deaa591bc82460e4de606900bed3702ca (patch)
tree727c58c92addc1b104846c000ffe6a411ea7c3fb
parentfcd42d15ec4b00f9ce6b358894147b4f330035c1 (diff)
downloadmongo-e4951d4deaa591bc82460e4de606900bed3702ca.tar.gz
SERVER-62041 Add a maximum batch execution time to dbCheck
(cherry picked from commit 18bf3b5fcf4e06fde48e3459d4020ce3db3fa29f)
-rw-r--r--jstests/noPassthrough/dbcheck_batch_deadline.js65
-rw-r--r--src/mongo/db/commands/dbcheck.cpp23
-rw-r--r--src/mongo/db/repl/dbcheck.cpp16
-rw-r--r--src/mongo/db/repl/dbcheck.h4
-rw-r--r--src/mongo/db/repl/dbcheck.idl12
5 files changed, 108 insertions, 12 deletions
diff --git a/jstests/noPassthrough/dbcheck_batch_deadline.js b/jstests/noPassthrough/dbcheck_batch_deadline.js
new file mode 100644
index 00000000000..e4516a4f4d9
--- /dev/null
+++ b/jstests/noPassthrough/dbcheck_batch_deadline.js
@@ -0,0 +1,65 @@
+/**
+ * Confirms that dbCheck stops processing a batch when reaching the deadline, and that
+ * the following batch resumes from where the previous one left off.
+ *
+ * @tags: [
+ * requires_journaling,
+ * requires_replication,
+ * ]
+ */
+
+(function() {
+"use strict";
+load("jstests/libs/fail_point_util.js");
+
+const replTest = new ReplSetTest({name: "dbcheck_batch_deadline", nodes: 2});
+replTest.startSet();
+replTest.initiate();
+
+const primary = replTest.getPrimary();
+const db = primary.getDB('test');
+const coll = db.c;
+const healthlog = primary.getDB('local').system.healthlog;
+
+// Populate the collection with collCount documents of the form {a: i}.
+const collCount = 3;
+for (let i = 0; i < collCount; i++) {
+ assert.commandWorked(coll.insert({a: i}));
+}
+
+// Run dbCheck with a failpoint configured so that we're only ever able to process 1 document per
+// batch before hitting the 1-second default maxBatchTimeMillis.
+const fp = configureFailPoint(primary, 'SleepDbCheckInBatch', {sleepMs: 2000});
+const timesEntered = fp.count;  // NOTE(review): never read below - confirm it can be removed.
+assert.commandWorked(db.runCommand({dbCheck: coll.getName()}));
+
+// Wait until the healthlog holds every expected entry (dbCheck is done), then disable the failpoint.
+assert.soon(function() {
+ // Expecting one entry per document, plus a last (maxKey) entry.
+ const expectedHealthLogEntries = collCount + 1;
+ return (healthlog.find().itcount() == expectedHealthLogEntries);
+}, "dbCheck command didn't complete - missing healthlog entries", 30 * 1000);
+fp.off();
+
+// Confirm each batch consists of 1 document, except for the last (maxKey) batch being empty.
+assert.eq(collCount,
+ healthlog
+ .find({
+ operation: "dbCheckBatch",
+ namespace: coll.getFullName(),
+ msg: "dbCheck batch consistent",
+ "data.count": 1
+ })
+ .itcount());
+assert.eq(1,
+ healthlog
+ .find({
+ operation: "dbCheckBatch",
+ namespace: coll.getFullName(),
+ msg: "dbCheck batch consistent",
+ "data.count": 0
+ })
+ .itcount());
+
+replTest.stopSet();
+})();
diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp
index 70ef441902f..abcae164b8c 100644
--- a/src/mongo/db/commands/dbcheck.cpp
+++ b/src/mongo/db/commands/dbcheck.cpp
@@ -68,6 +68,7 @@ struct DbCheckCollectionInfo {
int64_t maxCount;
int64_t maxSize;
int64_t maxRate;
+ int64_t maxBatchTimeMillis;
};
/**
@@ -89,12 +90,14 @@ std::unique_ptr<DbCheckRun> singleCollectionRun(OperationContext* opCtx,
"Cannot run dbCheck on " + nss.toString() + " because it is not replicated",
nss.isReplicated());
- auto start = invocation.getMinKey();
- auto end = invocation.getMaxKey();
- auto maxCount = invocation.getMaxCount();
- auto maxSize = invocation.getMaxSize();
- auto maxRate = invocation.getMaxCountPerSecond();
- auto info = DbCheckCollectionInfo{nss, start, end, maxCount, maxSize, maxRate};
+ const auto start = invocation.getMinKey();
+ const auto end = invocation.getMaxKey();
+ const auto maxCount = invocation.getMaxCount();
+ const auto maxSize = invocation.getMaxSize();
+ const auto maxRate = invocation.getMaxCountPerSecond();
+ const auto maxBatchTimeMillis = invocation.getMaxBatchTimeMillis();
+ const auto info =
+ DbCheckCollectionInfo{nss, start, end, maxCount, maxSize, maxRate, maxBatchTimeMillis};
auto result = std::make_unique<DbCheckRun>();
result->push_back(info);
return result;
@@ -111,12 +114,14 @@ std::unique_ptr<DbCheckRun> fullDatabaseRun(OperationContext* opCtx,
const int64_t max = std::numeric_limits<int64_t>::max();
const auto rate = invocation.getMaxCountPerSecond();
+ const auto maxBatchTimeMillis = invocation.getMaxBatchTimeMillis();
auto result = std::make_unique<DbCheckRun>();
auto perCollectionWork = [&](const CollectionPtr& coll) {
if (!coll->ns().isReplicated() || coll->isClustered()) {
return true;
}
- DbCheckCollectionInfo info{coll->ns(), BSONKey::min(), BSONKey::max(), max, max, rate};
+ DbCheckCollectionInfo info{
+ coll->ns(), BSONKey::min(), BSONKey::max(), max, max, rate, maxBatchTimeMillis};
result->push_back(info);
return true;
};
@@ -375,7 +380,8 @@ private:
return e.toStatus();
}
- Status status = hasher->hashAll();
+ const auto deadline = Date_t::now() + Milliseconds(info.maxBatchTimeMillis);
+ Status status = hasher->hashAll(opCtx, deadline);
if (!status.isOK()) {
return status;
@@ -504,6 +510,7 @@ public:
" maxCount: <max number of docs>,\n"
" maxSize: <max size of docs>,\n"
" maxCountPerSecond: <max rate in docs/sec> } "
+ " maxBatchTimeMillis: <max time processing a batch in milliseconds> } "
"to check a collection.\n"
"Invoke with {dbCheck: 1} to check all collections in the database.";
}
diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp
index 7cca8b6a1d2..ca6d1695b05 100644
--- a/src/mongo/db/repl/dbcheck.cpp
+++ b/src/mongo/db/repl/dbcheck.cpp
@@ -47,6 +47,8 @@
namespace mongo {
+MONGO_FAIL_POINT_DEFINE(SleepDbCheckInBatch);
+
namespace {
/*
@@ -236,11 +238,17 @@ void maybeAppend(md5_state_t* state, const boost::optional<UUID>& uuid) {
}
}
-Status DbCheckHasher::hashAll(void) {
+Status DbCheckHasher::hashAll(OperationContext* opCtx, Date_t deadline) {
BSONObj currentObj;
PlanExecutor::ExecState lastState;
while (PlanExecutor::ADVANCED == (lastState = _exec->getNext(&currentObj, nullptr))) {
+
+ SleepDbCheckInBatch.execute([opCtx](const BSONObj& data) {
+ int sleepMs = data["sleepMs"].safeNumberInt();
+ opCtx->sleepFor(Milliseconds(sleepMs));
+ });
+
if (!currentObj.hasField("_id")) {
return Status(ErrorCodes::NoSuchKey, "Document missing _id");
}
@@ -256,6 +264,10 @@ Status DbCheckHasher::hashAll(void) {
_countSeen += 1;
md5_append(&_state, md5Cast(currentObj.objdata()), currentObj.objsize());
+
+ if (Date_t::now() > deadline) {
+ break;
+ }
}
// If we got to the end of the collection, set the last key to MaxKey.
@@ -339,7 +351,7 @@ Status dbCheckBatchOnSecondary(OperationContext* opCtx,
// run the hasher.
if (status.isOK()) {
- status = hasher->hashAll();
+ status = hasher->hashAll(opCtx);
}
// In case of an error, report it to the health log,
diff --git a/src/mongo/db/repl/dbcheck.h b/src/mongo/db/repl/dbcheck.h
index 67076d07191..150580e7aa1 100644
--- a/src/mongo/db/repl/dbcheck.h
+++ b/src/mongo/db/repl/dbcheck.h
@@ -114,9 +114,9 @@ public:
int64_t maxBytes = std::numeric_limits<int64_t>::max());
/**
- * Hash all of our documents.
+ * Hash all documents up to the deadline.
*/
- Status hashAll(void);
+ Status hashAll(OperationContext* opCtx, Date_t deadline = Date_t::max());
/**
* Return the total hash of all documents seen so far.
diff --git a/src/mongo/db/repl/dbcheck.idl b/src/mongo/db/repl/dbcheck.idl
index c074eda2112..a262aa862d9 100644
--- a/src/mongo/db/repl/dbcheck.idl
+++ b/src/mongo/db/repl/dbcheck.idl
@@ -112,6 +112,12 @@ structs:
maxCountPerSecond:
type: safeInt64
default: "std::numeric_limits<int64_t>::max()"
+ maxBatchTimeMillis:
+ type: safeInt64
+ default: 1000
+ validator:
+ gte: 10
+ lte: 20000
DbCheckAllInvocation:
description: "Command object for database-wide form of dbCheck invocation"
@@ -122,6 +128,12 @@ structs:
maxCountPerSecond:
type: safeInt64
default: "std::numeric_limits<int64_t>::max()"
+ maxBatchTimeMillis:
+ type: safeInt64
+ default: 1000
+ validator:
+ gte: 10
+ lte: 20000
DbCheckOplogBatch:
description: "Oplog entry for a dbCheck batch"