diff options
field | value | date |
---|---|---|
author | Damian Wasilewicz <damian.wasilewicz@mongodb.com> | 2022-08-20 07:33:08 +0000 |
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-08-20 08:06:36 +0000 |
commit | 5b393e3066017a285646f9d50b91b5b1d9cb5a08 (patch) | |
tree | 558110a2859546d664af1707e175e0d726fc3aca /src/mongo | |
parent | 37b1dd33a1238cb5cdfe4a18d8d71fda4fd04e5f (diff) | |
download | mongo-5b393e3066017a285646f9d50b91b5b1d9cb5a08.tar.gz | |
SERVER-67562 Check for invalid UTF-8 characters in BSON documents during validation
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/bson/bson_validate.cpp | 22 | ||||
-rw-r--r-- | src/mongo/bson/bson_validate_test.cpp | 57 | ||||
-rw-r--r-- | src/mongo/util/str_escape.cpp | 22 | ||||
-rw-r--r-- | src/mongo/util/str_escape.h | 5 |
4 files changed, 101 insertions, 5 deletions
diff --git a/src/mongo/bson/bson_validate.cpp b/src/mongo/bson/bson_validate.cpp index 370dc14dac6..f0bb0d038b5 100644 --- a/src/mongo/bson/bson_validate.cpp +++ b/src/mongo/bson/bson_validate.cpp @@ -37,6 +37,7 @@ #include "mongo/bson/bsonelement.h" #include "mongo/bson/util/bsoncolumn.h" #include "mongo/logv2/log.h" +#include "mongo/util/str_escape.h" #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault @@ -89,8 +90,6 @@ class DefaultValidator { public: void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {} - void checkUTF8Char() {} - void checkDuplicateFieldName() {} void popLevel() {} @@ -157,8 +156,6 @@ public: } } - void checkUTF8Char() {} - void checkDuplicateFieldName() {} void popLevel() { @@ -226,6 +223,8 @@ public: void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) { registerFieldName(ptr + 1); ExtendedValidator::checkNonConformantElem(ptr, offsetToValue, type); + // Check the field name is UTF-8 encoded. + checkUTF8Char(ptr + 1); switch (type) { case BSONType::Array: { objFrames.push_back({std::vector<std::string>(), false}); @@ -248,9 +247,13 @@ public: uasserted(NonConformantBSON, "Exception ocurred while decompressing a BSON column."); } - break; } } + break; + } + case BSONType::String: { + // Increment pointer to actual value and then four more to skip size. 
+ checkUTF8Char(ptr + offsetToValue + 4); } } } @@ -287,6 +290,15 @@ private: objFrames.back().first.emplace_back(str); }; } + +private: + void checkUTF8Char(const char* ptr) { + try { + str::checkInvalidUTF8(ptr); + } catch (const ExceptionFor<ErrorCodes::BadValue>&) { + uasserted(NonConformantBSON, "Found string that doesn't follow UTF-8 encoding."); + } + } }; template <bool precise, typename BSONValidator> diff --git a/src/mongo/bson/bson_validate_test.cpp b/src/mongo/bson/bson_validate_test.cpp index ad097ffbc43..8f4176bddc2 100644 --- a/src/mongo/bson/bson_validate_test.cpp +++ b/src/mongo/bson/bson_validate_test.cpp @@ -333,6 +333,63 @@ TEST(BSONValidateExtended, BSONArrayIndexes) { ASSERT_EQ(status, ErrorCodes::NonConformantBSON); } +TEST(BSONValidateExtended, BSONUTF8) { + auto x1 = BSON("ValidString" + << "\x00" + << "ValidString2" + << "str"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 10000000; leading bit cannot be set for single byte UTF-8. + x1 = BSON("InvalidOneByteString" + << "\x80"); + auto status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); + + x1 = BSON("ValidTwoByteString" + << "\x40\x40"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 11011111 11001111; second bit of second byte cannot be set. 
+ x1 = BSON("InvalidTwoByteString" + << "\xDF\xCF"); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); + + x1 = BSON("ValidThreeByteString" + << "\x40\x40\x40"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 11101111 10111111 11111111 - second bit of third byte cannot be set. + x1 = BSON("InvalidThreeByteString" + << "\xEF\xBF\xFF"); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); + + x1 = BSON("ValidFourByteString" + << "\x40\x40\x40\x40"); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended)); + ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull)); + + // Invalid UTF-8 - 11110000 10011000 10011010 11111111 - second bit of fourth byte cannot be + // set. 
+ x1 = BSON("InvalidFourByteString" + << "\xF0\x98\x9A\xFF"); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended); + ASSERT_OK(status); + status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull); + ASSERT_EQ(status, ErrorCodes::NonConformantBSON); +} + TEST(BSONValidateFast, Empty) { BSONObj x; ASSERT_OK(validateBSON(x)); diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp index c42a916bab0..9dbb479af67 100644 --- a/src/mongo/util/str_escape.cpp +++ b/src/mongo/util/str_escape.cpp @@ -33,6 +33,8 @@ #include <array> #include <iterator> +#include "mongo/util/assert_util.h" + namespace mongo::str { namespace { constexpr char kHexChar[] = "0123456789abcdef"; @@ -496,4 +498,24 @@ std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite) escapeForJSONCommon(buffer, str, maxLength, wouldWrite); return buffer; } + +void checkInvalidUTF8(StringData str, size_t maxLength, size_t* wouldWrite) { + // No-op buffer and handlers, defined to re-use escape method logic. + std::string buffer; + auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {}; + auto twoByteEscaper = [](const auto& writer, uint8_t first, uint8_t second) {}; + + // Throws an exception when an invalid UTF8 character is detected. 
+ auto invalidByteHandler = [](const auto& writer, uint8_t) { + uasserted(ErrorCodes::BadValue, "Invalid UTF-8 Character"); + }; + + escape(buffer, + str, + std::move(singleByteHandler), + std::move(invalidByteHandler), + std::move(twoByteEscaper), + maxLength, + wouldWrite); +} } // namespace mongo::str diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h index 2d82e5697cd..14b89128a30 100644 --- a/src/mongo/util/str_escape.h +++ b/src/mongo/util/str_escape.h @@ -106,4 +106,9 @@ void escapeForJSON(fmt::memory_buffer& buffer, std::string escapeForJSON(StringData str, size_t maxLength = std::string::npos, size_t* wouldWrite = nullptr); + + +void checkInvalidUTF8(StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); } // namespace mongo::str |