summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Damian Wasilewicz <damian.wasilewicz@mongodb.com> 2022-08-20 07:33:08 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-20 08:06:36 +0000
commit 5b393e3066017a285646f9d50b91b5b1d9cb5a08 (patch)
tree 558110a2859546d664af1707e175e0d726fc3aca
parent 37b1dd33a1238cb5cdfe4a18d8d71fda4fd04e5f (diff)
download mongo-5b393e3066017a285646f9d50b91b5b1d9cb5a08.tar.gz
SERVER-67562 Check for invalid UTF-8 characters in BSON documents during validation
-rw-r--r-- jstests/disk/libs/wt_file_helper.js 20
-rw-r--r-- jstests/disk/validate_bson_inconsistency.js 30
-rw-r--r-- src/mongo/bson/bson_validate.cpp 22
-rw-r--r-- src/mongo/bson/bson_validate_test.cpp 57
-rw-r--r-- src/mongo/util/str_escape.cpp 22
-rw-r--r-- src/mongo/util/str_escape.h 5
6 files changed, 151 insertions, 5 deletions
diff --git a/jstests/disk/libs/wt_file_helper.js b/jstests/disk/libs/wt_file_helper.js
index fa84110856d..e9924139a54 100644
--- a/jstests/disk/libs/wt_file_helper.js
+++ b/jstests/disk/libs/wt_file_helper.js
@@ -372,4 +372,24 @@ let insertInvalidRegex = function(coll, mongod, nDocuments) {
}
};
rewriteTable(getUriForColl(coll), mongod, swapOptions);
+};
+
+/**
+ * Inserts document with invalid UTF-8 string into the MongoDB server.
+ */
+let insertInvalidUTF8 = function(coll, uri, conn, numDocs) {
+ for (let i = 0; i < numDocs; ++i) {
+ coll.insert({validString: "\x70"});
+ }
+ let makeInvalidUTF8 = function(lines) {
+ // The offset of the first byte of the string, flips \x70 to \x80 (10000000) - invalid
+ // because single byte UTF-8 cannot have a leading 1.
+ const offsetToString = 76;
+ // Each record takes two lines with a key and a value. We will only modify the values.
+ for (let i = wtHeaderLines; i < lines.length; i += 2) {
+ lines[i] = lines[i].substring(0, offsetToString) + "8" +
+ lines[i].substring(offsetToString + 1);
+ }
+ };
+ rewriteTable(uri, conn, makeInvalidUTF8);
}; \ No newline at end of file
diff --git a/jstests/disk/validate_bson_inconsistency.js b/jstests/disk/validate_bson_inconsistency.js
index 91e406f6f25..9ea0e222033 100644
--- a/jstests/disk/validate_bson_inconsistency.js
+++ b/jstests/disk/validate_bson_inconsistency.js
@@ -248,4 +248,34 @@ resetDbpath(dbpath);
MongoRunner.stopMongod(mongod, null, {skipValidation: true});
})();
+
+(function validateDocumentsInvalidUTF8() {
+ jsTestLog("Validate documents with invalid UTF-8 strings");
+
+ let mongod = startMongodOnExistingPath(dbpath);
+ let db = mongod.getDB(baseName);
+ const collName = collNamePrefix + count++;
+ db.createCollection(collName);
+ let testColl = db[collName];
+
+ let uri = getUriForColl(testColl);
+ const numDocs = 10;
+ insertInvalidUTF8(testColl, uri, mongod, numDocs);
+
+ mongod = startMongodOnExistingPath(dbpath);
+ db = mongod.getDB(baseName);
+ testColl = db[collName];
+
+ res = assert.commandWorked(testColl.validate());
+ assert(res.valid, tojson(res));
+ assert.eq(res.nNonCompliantDocuments, 0);
+ assert.eq(res.warnings.length, 0);
+
+ res = assert.commandWorked(testColl.validate({checkBSONConsistency: true}));
+ assert(res.valid, tojson(res));
+ assert.eq(res.nNonCompliantDocuments, 10);
+ assert.eq(res.warnings.length, 1);
+
+ MongoRunner.stopMongod(mongod, null, {skipValidation: true});
+})();
})();
diff --git a/src/mongo/bson/bson_validate.cpp b/src/mongo/bson/bson_validate.cpp
index 370dc14dac6..f0bb0d038b5 100644
--- a/src/mongo/bson/bson_validate.cpp
+++ b/src/mongo/bson/bson_validate.cpp
@@ -37,6 +37,7 @@
#include "mongo/bson/bsonelement.h"
#include "mongo/bson/util/bsoncolumn.h"
#include "mongo/logv2/log.h"
+#include "mongo/util/str_escape.h"
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
@@ -89,8 +90,6 @@ class DefaultValidator {
public:
void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {}
- void checkUTF8Char() {}
-
void checkDuplicateFieldName() {}
void popLevel() {}
@@ -157,8 +156,6 @@ public:
}
}
- void checkUTF8Char() {}
-
void checkDuplicateFieldName() {}
void popLevel() {
@@ -226,6 +223,8 @@ public:
void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {
registerFieldName(ptr + 1);
ExtendedValidator::checkNonConformantElem(ptr, offsetToValue, type);
+ // Check the field name is UTF-8 encoded.
+ checkUTF8Char(ptr + 1);
switch (type) {
case BSONType::Array: {
objFrames.push_back({std::vector<std::string>(), false});
@@ -248,9 +247,13 @@ public:
uasserted(NonConformantBSON,
"Exception ocurred while decompressing a BSON column.");
}
- break;
}
}
+ break;
+ }
+ case BSONType::String: {
+ // Increment pointer to actual value and then four more to skip size.
+ checkUTF8Char(ptr + offsetToValue + 4);
}
}
}
@@ -287,6 +290,15 @@ private:
objFrames.back().first.emplace_back(str);
};
}
+
+private:
+    /**
+     * Runs str::checkInvalidUTF8() over the string starting at 'ptr' and reports a failure as
+     * NonConformantBSON instead of the BadValue error raised by that helper.
+     *
+     * NOTE(review): 'ptr' is converted to StringData as a C string, so presumably only the bytes
+     * up to the first NUL are inspected - confirm whether String values that embed NUL bytes
+     * need a length-aware check.
+     */
+    void checkUTF8Char(const char* ptr) {
+        const StringData candidate(ptr);
+        try {
+            str::checkInvalidUTF8(candidate);
+        } catch (const ExceptionFor<ErrorCodes::BadValue>&) {
+            // Re-raise under the error code used for non-conformant BSON.
+            uasserted(NonConformantBSON, "Found string that doesn't follow UTF-8 encoding.");
+        }
+    }
};
template <bool precise, typename BSONValidator>
diff --git a/src/mongo/bson/bson_validate_test.cpp b/src/mongo/bson/bson_validate_test.cpp
index ad097ffbc43..8f4176bddc2 100644
--- a/src/mongo/bson/bson_validate_test.cpp
+++ b/src/mongo/bson/bson_validate_test.cpp
@@ -333,6 +333,63 @@ TEST(BSONValidateExtended, BSONArrayIndexes) {
ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
}
+// Checks UTF-8 validation of string values: well-formed strings pass in both kExtended and
+// kFull modes, while malformed strings pass in kExtended and fail with NonConformantBSON in
+// kFull.
+TEST(BSONValidateExtended, BSONUTF8) {
+    auto x1 = BSON("ValidString"
+                   << "\x00"
+                   << "ValidString2"
+                   << "str");
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+    // Invalid UTF-8 - 10000000; leading bit cannot be set for single byte UTF-8.
+    x1 = BSON("InvalidOneByteString"
+              << "\x80");
+    auto status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+    ASSERT_OK(status);
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+    ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+
+    // Valid UTF-8 - 11000011 10101001 (U+00E9); a genuine two byte sequence rather than two
+    // single byte characters.
+    x1 = BSON("ValidTwoByteString"
+              << "\xC3\xA9");
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+    // Invalid UTF-8 - 11011111 11001111; second bit of second byte cannot be set.
+    x1 = BSON("InvalidTwoByteString"
+              << "\xDF\xCF");
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+    ASSERT_OK(status);
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+    ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+
+    // Valid UTF-8 - 11100010 10000010 10101100 (U+20AC); a genuine three byte sequence.
+    x1 = BSON("ValidThreeByteString"
+              << "\xE2\x82\xAC");
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+    // Invalid UTF-8 - 11101111 10111111 11111111 - second bit of third byte cannot be set.
+    x1 = BSON("InvalidThreeByteString"
+              << "\xEF\xBF\xFF");
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+    ASSERT_OK(status);
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+    ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+
+    // Valid UTF-8 - 11110000 10011111 10011000 10000000 (U+1F600); a genuine four byte sequence.
+    x1 = BSON("ValidFourByteString"
+              << "\xF0\x9F\x98\x80");
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+    ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+    // Invalid UTF-8 - 11110000 10011000 10011010 11111111 - second bit of fourth byte cannot be
+    // set.
+    x1 = BSON("InvalidFourByteString"
+              << "\xF0\x98\x9A\xFF");
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+    ASSERT_OK(status);
+    status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+    ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+}
+
TEST(BSONValidateFast, Empty) {
BSONObj x;
ASSERT_OK(validateBSON(x));
diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp
index c42a916bab0..9dbb479af67 100644
--- a/src/mongo/util/str_escape.cpp
+++ b/src/mongo/util/str_escape.cpp
@@ -33,6 +33,8 @@
#include <array>
#include <iterator>
+#include "mongo/util/assert_util.h"
+
namespace mongo::str {
namespace {
constexpr char kHexChar[] = "0123456789abcdef";
@@ -496,4 +498,24 @@ std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite)
escapeForJSONCommon(buffer, str, maxLength, wouldWrite);
return buffer;
}
+
+void checkInvalidUTF8(StringData str, size_t maxLength, size_t* wouldWrite) {
+    // escape() takes an output buffer; none of the handlers below ever write to it.
+    std::string scratch;
+
+    // The handler for invalid bytes is the only one with an effect: it aborts with BadValue.
+    auto rejectInvalidByte = [](const auto&, uint8_t) {
+        uasserted(ErrorCodes::BadValue, "Invalid UTF-8 Character");
+    };
+
+    // The remaining handlers exist only so the shared escape() logic can be re-used; they
+    // deliberately do nothing.
+    auto ignoreSingleByte = [](const auto&, uint8_t) {};
+    auto ignoreTwoBytes = [](const auto&, uint8_t, uint8_t) {};
+
+    escape(scratch,
+           str,
+           std::move(ignoreSingleByte),
+           std::move(rejectInvalidByte),
+           std::move(ignoreTwoBytes),
+           maxLength,
+           wouldWrite);
+}
} // namespace mongo::str
diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h
index 2d82e5697cd..14b89128a30 100644
--- a/src/mongo/util/str_escape.h
+++ b/src/mongo/util/str_escape.h
@@ -106,4 +106,9 @@ void escapeForJSON(fmt::memory_buffer& buffer,
std::string escapeForJSON(StringData str,
size_t maxLength = std::string::npos,
size_t* wouldWrite = nullptr);
+// Throws a BadValue error if `str` contains a byte that is not valid UTF-8. `maxLength` and
+// `wouldWrite` are passed through unchanged to the escaping routine that scans the string.
+void checkInvalidUTF8(StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
} // namespace mongo::str