diff options
author | Damian Wasilewicz <damian.wasilewicz@mongodb.com> | 2022-08-20 07:33:08 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-08-20 08:06:36 +0000 |
commit | 5b393e3066017a285646f9d50b91b5b1d9cb5a08 (patch) | |
tree | 558110a2859546d664af1707e175e0d726fc3aca | |
parent | 37b1dd33a1238cb5cdfe4a18d8d71fda4fd04e5f (diff) | |
download | mongo-5b393e3066017a285646f9d50b91b5b1d9cb5a08.tar.gz |
SERVER-67562 Check for invalid UTF-8 characters in BSON documents during validation
-rw-r--r-- | jstests/disk/libs/wt_file_helper.js | 20 | ||||
-rw-r--r-- | jstests/disk/validate_bson_inconsistency.js | 30 | ||||
-rw-r--r-- | src/mongo/bson/bson_validate.cpp | 22 | ||||
-rw-r--r-- | src/mongo/bson/bson_validate_test.cpp | 57 | ||||
-rw-r--r-- | src/mongo/util/str_escape.cpp | 22 | ||||
-rw-r--r-- | src/mongo/util/str_escape.h | 5 |
6 files changed, 151 insertions, 5 deletions
diff --git a/jstests/disk/libs/wt_file_helper.js b/jstests/disk/libs/wt_file_helper.js index fa84110856d..e9924139a54 100644 --- a/jstests/disk/libs/wt_file_helper.js +++ b/jstests/disk/libs/wt_file_helper.js @@ -372,4 +372,24 @@ let insertInvalidRegex = function(coll, mongod, nDocuments) { } }; rewriteTable(getUriForColl(coll), mongod, swapOptions); +}; + +/** + * Inserts document with invalid UTF-8 string into the MongoDB server. + */ +let insertInvalidUTF8 = function(coll, uri, conn, numDocs) { + for (let i = 0; i < numDocs; ++i) { + coll.insert({validString: "\x70"}); + } + let makeInvalidUTF8 = function(lines) { + // The offset of the first byte of the string, flips \x70 to \x80 (10000000) - invalid + // because single byte UTF-8 cannot have a leading 1. + const offsetToString = 76; + // Each record takes two lines with a key and a value. We will only modify the values. + for (let i = wtHeaderLines; i < lines.length; i += 2) { + lines[i] = lines[i].substring(0, offsetToString) + "8" + + lines[i].substring(offsetToString + 1); + } + }; + rewriteTable(uri, conn, makeInvalidUTF8); };
\ No newline at end of file diff --git a/jstests/disk/validate_bson_inconsistency.js b/jstests/disk/validate_bson_inconsistency.js index 91e406f6f25..9ea0e222033 100644 --- a/jstests/disk/validate_bson_inconsistency.js +++ b/jstests/disk/validate_bson_inconsistency.js @@ -248,4 +248,34 @@ resetDbpath(dbpath); MongoRunner.stopMongod(mongod, null, {skipValidation: true}); })(); + +(function validateDocumentsInvalidUTF8() { + jsTestLog("Validate documents with invalid UTF-8 strings"); + + let mongod = startMongodOnExistingPath(dbpath); + let db = mongod.getDB(baseName); + const collName = collNamePrefix + count++; + db.createCollection(collName); + let testColl = db[collName]; + + let uri = getUriForColl(testColl); + const numDocs = 10; + insertInvalidUTF8(testColl, uri, mongod, numDocs); + + mongod = startMongodOnExistingPath(dbpath); + db = mongod.getDB(baseName); + testColl = db[collName]; + + res = assert.commandWorked(testColl.validate()); + assert(res.valid, tojson(res)); + assert.eq(res.nNonCompliantDocuments, 0); + assert.eq(res.warnings.length, 0); + + res = assert.commandWorked(testColl.validate({checkBSONConsistency: true})); + assert(res.valid, tojson(res)); + assert.eq(res.nNonCompliantDocuments, 10); + assert.eq(res.warnings.length, 1); + + MongoRunner.stopMongod(mongod, null, {skipValidation: true}); +})(); })(); diff --git a/src/mongo/bson/bson_validate.cpp b/src/mongo/bson/bson_validate.cpp index 370dc14dac6..f0bb0d038b5 100644 --- a/src/mongo/bson/bson_validate.cpp +++ b/src/mongo/bson/bson_validate.cpp @@ -37,6 +37,7 @@ #include "mongo/bson/bsonelement.h" #include "mongo/bson/util/bsoncolumn.h" #include "mongo/logv2/log.h" +#include "mongo/util/str_escape.h" #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault @@ -89,8 +90,6 @@ class DefaultValidator { public: void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {} - void checkUTF8Char() {} - void checkDuplicateFieldName() {} void popLevel() {} @@ -157,8 +156,6 @@ public: } } - void checkUTF8Char() {} - void checkDuplicateFieldName() {} void popLevel() { @@ -226,6 +223,8 @@ public: void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) { registerFieldName(ptr + 1); ExtendedValidator::checkNonConformantElem(ptr, offsetToValue, type); + // Check the field name is UTF-8 encoded. + checkUTF8Char(ptr + 1); switch (type) { case BSONType::Array: { objFrames.push_back({std::vector<std::string>(), false}); @@ -248,9 +247,13 @@ public: uasserted(NonConformantBSON, "Exception ocurred while decompressing a BSON column."); } - break; } } + break; + } + case BSONType::String: { + // Increment pointer to actual value and then four more to skip size. + checkUTF8Char(ptr + offsetToValue + 4); } } } @@ -287,6 +290,15 @@ private: objFrames.back().first.emplace_back(str); }; } + +private: + void checkUTF8Char(const char* ptr) { + try { + str::checkInvalidUTF8(ptr); + } catch (const ExceptionFor<ErrorCodes::BadValue>&) { + uasserted(NonConformantBSON, "Found string that doesn't follow UTF-8 encoding."); + } + } }; template <bool precise, typename BSONValidator> diff --git a/src/mongo/bson/bson_validate_test.cpp b/src/mongo/bson/bson_validate_test.cpp index ad097ffbc43..8f4176bddc2 100644 --- a/src/mongo/bson/bson_validate_test.cpp +++ b/src/mongo/bson/bson_validate_test.cpp @@ -333,6 +333,63 @@ TEST(BSONValidateExtended, BSONArrayIndexes) { ASSERT_EQ(status, ErrorCodes::NonConformantBSON); } +TEST(BSONValidateExtended, BSONUTF8) { + auto x1 = BSON("ValidString" + << "\x00" + << "ValidString2" + << "str"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 10000000; leading bit cannot be set for single byte UTF-8. + x1 = BSON("InvalidOneByteString" + << "\x80"); + auto status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); + + x1 = BSON("ValidTwoByteString" + << "\x40\x40"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 11011111 11001111; second bit of second byte cannot be set. + x1 = BSON("InvalidTwoByteString" + << "\xDF\xCF"); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); + + x1 = BSON("ValidThreeByteString" + << "\x40\x40\x40"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 11101111 10111111 11111111 - second bit of third byte cannot be set. + x1 = BSON("InvalidThreeByteString" + << "\xEF\xBF\xFF"); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); + + x1 = BSON("ValidFourByteString" + << "\x40\x40\x40\x40"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 11110000 10011000 10011010 11111111 - second bit of fourth byte cannot be + // set. + x1 = BSON("InvalidFourByteString" + << "\xF0\x98\x9A\xFF"); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); +} + TEST(BSONValidateFast, Empty) { BSONObj x; ASSERT_OK(validateBSON(x)); diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp index c42a916bab0..9dbb479af67 100644 --- a/src/mongo/util/str_escape.cpp +++ b/src/mongo/util/str_escape.cpp @@ -33,6 +33,8 @@ #include <array> #include <iterator> +#include "mongo/util/assert_util.h" + namespace mongo::str { namespace { constexpr char kHexChar[] = "0123456789abcdef"; @@ -496,4 +498,24 @@ std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite) escapeForJSONCommon(buffer, str, maxLength, wouldWrite); return buffer; } + +void checkInvalidUTF8(StringData str, size_t maxLength, size_t* wouldWrite) { + // No-op buffer and handlers, defined to re-use escape method logic. + std::string buffer; + auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {}; + auto twoByteEscaper = [](const auto& writer, uint8_t first, uint8_t second) {}; + + // Throws an exception when an invalid UTF8 character is detected. + auto invalidByteHandler = [](const auto& writer, uint8_t) { + uasserted(ErrorCodes::BadValue, "Invalid UTF-8 Character"); + }; + + escape(buffer, + str, + std::move(singleByteHandler), + std::move(invalidByteHandler), + std::move(twoByteEscaper), + maxLength, + wouldWrite); +} } // namespace mongo::str diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h index 2d82e5697cd..14b89128a30 100644 --- a/src/mongo/util/str_escape.h +++ b/src/mongo/util/str_escape.h @@ -106,4 +106,9 @@ void escapeForJSON(fmt::memory_buffer& buffer, std::string escapeForJSON(StringData str, size_t maxLength = std::string::npos, size_t* wouldWrite = nullptr); + + +void checkInvalidUTF8(StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); } // namespace mongo::str |