author    Dan Larkin-York <dan.larkin-york@mongodb.com>  2023-02-03 23:45:23 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com>  2023-02-11 14:05:59 +0000
commit    bcf0c75c8dbed2a84ad69ba5bd7aa200115f8cb9 (patch)
tree      1addd97ed85c13bf8619f31760091a7d4dc80a2c
parent    78f09d6c5a0a4c814335e5ec7608d9b0c5099092 (diff)
SERVER-72512 SERVER-73636 Ensure validate surfaces index key inconsistencies if they exist
(cherry picked from commit c0e1438cad31e00372a9f9edaee80f9db5a4e8ed)
(cherry picked from commit 2d90115a227572cc0f46b309ceb47dda6b740da3)
-rw-r--r--  jstests/disk/libs/wt_file_helper.js             10
-rw-r--r--  jstests/noPassthrough/validate_memory_limit.js  58
-rw-r--r--  src/mongo/db/catalog/index_consistency.cpp      92
3 files changed, 135 insertions, 25 deletions
diff --git a/jstests/disk/libs/wt_file_helper.js b/jstests/disk/libs/wt_file_helper.js
index 1d2cd673596..4a8b136e938 100644
--- a/jstests/disk/libs/wt_file_helper.js
+++ b/jstests/disk/libs/wt_file_helper.js
@@ -207,3 +207,13 @@ let runWiredTigerTool = function(...args) {
const cmd = ['wt'].concat(args);
assert.eq(run.apply(undefined, cmd), 0, "error executing: " + cmd.join(' '));
};
+
+/**
+ * Stops the given mongod, runs the truncate command on the given uri using the WiredTiger tool, and
+ * starts mongod again on the same path.
+ */
+let truncateUriAndRestartMongod = function(uri, conn) {
+ MongoRunner.stopMongod(conn, null, {skipValidation: true});
+ runWiredTigerTool("-h", conn.dbpath, "truncate", uri);
+ return startMongodOnExistingPath(conn.dbpath, {});
+};
\ No newline at end of file
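
For context, a minimal usage sketch of the new helper, mirroring how the test added below drives it. This assumes the existing helpers in this file (getUriForIndex, startMongodOnExistingPath) are loaded; the collection name is illustrative:

load("jstests/disk/libs/wt_file_helper.js");

// Corrupt a collection's _id index by truncating its underlying WiredTiger
// table, then restart mongod on the same dbpath.
let conn = MongoRunner.runMongod({});
let coll = conn.getDB("test").getCollection("corrupt");
assert.commandWorked(coll.insert({_id: 1}));

const uri = getUriForIndex(coll, "_id_");
conn = truncateUriAndRestartMongod(uri, conn);

// The _id index now lacks an entry for the inserted document, so a full
// validation is expected to flag the collection as invalid.
const res = conn.getDB("test").getCollection("corrupt").validate({full: true});
assert(!res.valid, tojson(res));
MongoRunner.stopMongod(conn, null, {skipValidation: true});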
diff --git a/jstests/noPassthrough/validate_memory_limit.js b/jstests/noPassthrough/validate_memory_limit.js
new file mode 100644
index 00000000000..a3903e36905
--- /dev/null
+++ b/jstests/noPassthrough/validate_memory_limit.js
@@ -0,0 +1,58 @@
+/**
+ * Test that the memory usage of validate is properly limited according to the
+ * maxValidateMemoryUsageMB parameter.
+ *
+ * @tags: [requires_wiredtiger, requires_persistence]
+ */
+(function() {
+"use strict";
+
+load("jstests/disk/libs/wt_file_helper.js");
+
+const kIndexKeyLength = 1024 * 1024;
+
+const baseName = "validate_memory_limit";
+const dbpath = MongoRunner.dataPath + baseName + "/";
+let conn = MongoRunner.runMongod({dbpath: dbpath});
+let coll = conn.getDB("test").getCollection("corrupt");
+
+function corruptIndex() {
+ const uri = getUriForIndex(coll, "_id_");
+ conn = truncateUriAndRestartMongod(uri, conn);
+ coll = conn.getDB("test").getCollection("corrupt");
+}
+
+function checkValidate(maxMemoryUsage, {minMissingKeys, maxMissingKeys}) {
+ conn.getDB("test").adminCommand({setParameter: 1, maxValidateMemoryUsageMB: maxMemoryUsage});
+ const res = coll.validate({full: true});
+ assert.commandWorked(res);
+ assert(!res.valid, tojson(res));
+ assert.gte(res.missingIndexEntries.length, minMissingKeys, tojson(res));
+ assert.lte(res.missingIndexEntries.length, maxMissingKeys, tojson(res));
+}
+
+// Insert a document with a key larger than maxValidateMemoryUsageMB and test that we still report
+// at least one inconsistency.
+const indexKey = "a".repeat(kIndexKeyLength);
+assert.commandWorked(coll.insert({_id: indexKey}));
+corruptIndex();
+checkValidate(1, {minMissingKeys: 1, maxMissingKeys: 1});
+
+// Clear collection between tests.
+coll.drop();
+
+// Test that if we have keys distributed across many buckets, and would exceed
+// maxValidateMemoryUsageMB, we report as many inconsistencies as we can.
+for (let i = 0; i < 10; ++i) {
+ const indexKey = i.toString().repeat(kIndexKeyLength / 5);
+ assert.commandWorked(coll.insert({_id: indexKey}));
+}
+
+corruptIndex();
+// If each key is maxMem/5, then we can keep 4 of them (the 5th would put us at the limit). However,
+// each key is counted twice, so realistically we only expect to track 2 of them. There's also a
+// small chance we could get hash collisions that would lead to us reporting only 1.
+checkValidate(1, {minMissingKeys: 1, maxMissingKeys: 2});
+
+MongoRunner.stopMongod(conn, null, {skipValidation: true});
+})();
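
The bounds in the final checkValidate call follow from a short back-of-envelope calculation; a hedged sketch of that arithmetic, with the doubling factor taken from the comment in the test above:

// Rough arithmetic behind {minMissingKeys: 1, maxMissingKeys: 2}, assuming
// maxValidateMemoryUsageMB = 1 and ten keys of roughly (1 MiB / 5) each.
const budgetBytes = 1 * 1024 * 1024;
const keyBytes = budgetBytes / 5;
// Each key is counted twice against the budget, so validate can only keep
// track of about this many keys:
const expectedTracked = Math.floor(budgetBytes / (2 * keyBytes));  // 2
// Hash collisions can merge keys into one bucket, which is why the test also
// accepts a single reported missing index entry.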
diff --git a/src/mongo/db/catalog/index_consistency.cpp b/src/mongo/db/catalog/index_consistency.cpp
index da72e768898..bde9394b2f7 100644
--- a/src/mongo/db/catalog/index_consistency.cpp
+++ b/src/mongo/db/catalog/index_consistency.cpp
@@ -155,15 +155,31 @@ void IndexConsistency::addIndexEntryErrors(ValidateResultsMap* indexNsResultsMap
numExtraIndexEntryErrors += item.second.size();
}
- // Inform which indexes have inconsistences and add the BSON objects of the inconsistent index
+ // Sort missing index entries by size so we can process in order of increasing size and return
+ // as many as possible within memory limits.
+ using MissingIt = decltype(_missingIndexEntries)::const_iterator;
+ std::vector<MissingIt> missingIndexEntriesBySize;
+ missingIndexEntriesBySize.reserve(_missingIndexEntries.size());
+ for (auto it = _missingIndexEntries.begin(); it != _missingIndexEntries.end(); ++it) {
+ missingIndexEntriesBySize.push_back(it);
+ }
+ std::sort(missingIndexEntriesBySize.begin(),
+ missingIndexEntriesBySize.end(),
+ [](const MissingIt& a, const MissingIt& b) {
+ return a->second.objsize() < b->second.objsize();
+ });
+
+ // Inform which indexes have inconsistencies and add the BSON objects of the inconsistent index
// entries to the results vector.
bool missingIndexEntrySizeLimitWarning = false;
- for (const auto& missingIndexEntry : _missingIndexEntries) {
- const BSONObj& entry = missingIndexEntry.second;
+ bool first = true;
+ for (const auto& it : missingIndexEntriesBySize) {
+ const auto& entry = it->second;
numMissingIndexEntriesSizeBytes += entry.objsize();
- if (numMissingIndexEntriesSizeBytes <= kErrorSizeBytes) {
+ if (first || numMissingIndexEntriesSizeBytes <= kErrorSizeBytes) {
results->missingIndexEntries.push_back(entry);
+ first = false;
} else if (!missingIndexEntrySizeLimitWarning) {
StringBuilder ss;
ss << "Not all missing index entry inconsistencies are listed due to size limitations.";
@@ -184,33 +200,58 @@ void IndexConsistency::addIndexEntryErrors(ValidateResultsMap* indexNsResultsMap
indexNsResultsMap->at(indexName).valid = false;
}
- bool extraIndexEntrySizeLimitWarning = false;
+ // Sort extra index entries by size so we can process in order of increasing size and return as
+ // many as possible within memory limits.
+ using ExtraIt = SimpleBSONObjSet::const_iterator;
+ std::vector<ExtraIt> extraIndexEntriesBySize;
+ // Since the extra entries are stored in a map of sets, we have to iterate the entries in the
+ // map and sum the size of the sets in order to get the total number. Given that we can have at
+ // most 64 indexes per collection, and the total number of entries could potentially be in the
+ // millions, we expect that iterating the map will be much less costly than the additional
+ // allocations and copies that could result from not calling 'reserve' on the vector.
+ size_t totalExtraIndexEntriesCount =
+ std::accumulate(_extraIndexEntries.begin(),
+ _extraIndexEntries.end(),
+ 0,
+ [](size_t total, const std::pair<IndexKey, SimpleBSONObjSet>& set) {
+ return total + set.second.size();
+ });
+ extraIndexEntriesBySize.reserve(totalExtraIndexEntriesCount);
for (const auto& extraIndexEntry : _extraIndexEntries) {
const SimpleBSONObjSet& entries = extraIndexEntry.second;
- for (const auto& entry : entries) {
- numExtraIndexEntriesSizeBytes += entry.objsize();
- if (numExtraIndexEntriesSizeBytes <= kErrorSizeBytes) {
- results->extraIndexEntries.push_back(entry);
- } else if (!extraIndexEntrySizeLimitWarning) {
- StringBuilder ss;
- ss << "Not all extra index entry inconsistencies are listed due to size "
- "limitations.";
- results->errors.push_back(ss.str());
-
- extraIndexEntrySizeLimitWarning = true;
- }
-
- std::string indexName = entry["indexName"].String();
- if (!indexNsResultsMap->at(indexName).valid) {
- continue;
- }
+ for (auto it = entries.begin(); it != entries.end(); ++it) {
+ extraIndexEntriesBySize.push_back(it);
+ }
+ }
+ std::sort(extraIndexEntriesBySize.begin(),
+ extraIndexEntriesBySize.end(),
+ [](const ExtraIt& a, const ExtraIt& b) { return a->objsize() < b->objsize(); });
+ bool extraIndexEntrySizeLimitWarning = false;
+ for (const auto& entry : extraIndexEntriesBySize) {
+ numExtraIndexEntriesSizeBytes += entry->objsize();
+ if (first || numExtraIndexEntriesSizeBytes <= kErrorSizeBytes) {
+ results->extraIndexEntries.push_back(*entry);
+ first = false;
+ } else if (!extraIndexEntrySizeLimitWarning) {
StringBuilder ss;
- ss << "Index with name '" << indexName << "' has inconsistencies.";
+ ss << "Not all extra index entry inconsistencies are listed due to size "
+ "limitations.";
results->errors.push_back(ss.str());
- indexNsResultsMap->at(indexName).valid = false;
+ extraIndexEntrySizeLimitWarning = true;
}
+
+ std::string indexName = (*entry)["indexName"].String();
+ if (!indexNsResultsMap->at(indexName).valid) {
+ continue;
+ }
+
+ StringBuilder ss;
+ ss << "Index with name '" << indexName << "' has inconsistencies.";
+ results->errors.push_back(ss.str());
+
+ indexNsResultsMap->at(indexName).valid = false;
}
// Inform how many inconsistencies were detected.
@@ -256,7 +297,8 @@ void IndexConsistency::addDocKey(const KeyString& ks,
BSONObj info = _generateInfo(*indexInfo, recordId, indexKey, idKey);
- // Cannot have duplicate KeyStrings during the document scan phase for the same index.
+ // Cannot have duplicate KeyStrings during the document scan phase for the same
+ // index.
IndexKey key = _generateKeyForMap(*indexInfo, ks);
invariant(_missingIndexEntries.count(key) == 0);
_missingIndexEntries.insert(std::make_pair(key, info));
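
Taken together, the C++ change above amounts to a simple reporting policy: collect the inconsistencies, sort them by ascending BSON size, always report the smallest entry even if it alone exceeds the size budget, and keep reporting until the running total passes kErrorSizeBytes. A minimal sketch of that policy in shell JavaScript, with selectWithinBudget, sizeOf, and budgetBytes as illustrative names rather than server code:

// Entries are considered smallest-first; the first entry is always reported,
// and later entries are reported only while the cumulative size stays within
// the budget. The total keeps accumulating either way, mirroring the loops in
// addIndexEntryErrors.
function selectWithinBudget(entries, sizeOf, budgetBytes) {
    const bySize = [...entries].sort((a, b) => sizeOf(a) - sizeOf(b));
    const reported = [];
    let totalBytes = 0;
    let first = true;
    for (const entry of bySize) {
        totalBytes += sizeOf(entry);
        if (first || totalBytes <= budgetBytes) {
            reported.push(entry);
            first = false;
        }
    }
    return reported;
}

// Even a single entry larger than the whole budget is still reported, which is
// what guarantees validate surfaces at least one inconsistency.
assert.eq(selectWithinBudget(["x".repeat(100)], (s) => s.length, 10).length, 1);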