summary | refs | log | tree | commit | diff
path: root/src/mongo
diff options
context:
space:
mode:
author Damian Wasilewicz <damian.wasilewicz@mongodb.com> 2022-08-20 07:33:08 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-20 08:06:36 +0000
commit 5b393e3066017a285646f9d50b91b5b1d9cb5a08 (patch)
tree 558110a2859546d664af1707e175e0d726fc3aca /src/mongo
parent 37b1dd33a1238cb5cdfe4a18d8d71fda4fd04e5f (diff)
download mongo-5b393e3066017a285646f9d50b91b5b1d9cb5a08.tar.gz
SERVER-67562 Check for invalid UTF-8 characters in BSON documents during validation
Diffstat (limited to 'src/mongo')
-rw-r--r-- src/mongo/bson/bson_validate.cpp 22
-rw-r--r-- src/mongo/bson/bson_validate_test.cpp 57
-rw-r--r-- src/mongo/util/str_escape.cpp 22
-rw-r--r-- src/mongo/util/str_escape.h 5
4 files changed, 101 insertions, 5 deletions
diff --git a/src/mongo/bson/bson_validate.cpp b/src/mongo/bson/bson_validate.cpp
index 370dc14dac6..f0bb0d038b5 100644
--- a/src/mongo/bson/bson_validate.cpp
+++ b/src/mongo/bson/bson_validate.cpp
@@ -37,6 +37,7 @@
#include "mongo/bson/bsonelement.h"
#include "mongo/bson/util/bsoncolumn.h"
#include "mongo/logv2/log.h"
+#include "mongo/util/str_escape.h"
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
@@ -89,8 +90,6 @@ class DefaultValidator {
public:
void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {}
- void checkUTF8Char() {}
-
void checkDuplicateFieldName() {}
void popLevel() {}
@@ -157,8 +156,6 @@ public:
}
}
- void checkUTF8Char() {}
-
void checkDuplicateFieldName() {}
void popLevel() {
@@ -226,6 +223,8 @@ public:
void checkNonConformantElem(const char* ptr, uint32_t offsetToValue, uint8_t type) {
registerFieldName(ptr + 1);
ExtendedValidator::checkNonConformantElem(ptr, offsetToValue, type);
+ // Check the field name is UTF-8 encoded.
+ checkUTF8Char(ptr + 1);
switch (type) {
case BSONType::Array: {
objFrames.push_back({std::vector<std::string>(), false});
@@ -248,9 +247,13 @@ public:
uasserted(NonConformantBSON,
"Exception ocurred while decompressing a BSON column.");
}
- break;
}
}
+ break;
+ }
+ case BSONType::String: {
+ // Increment pointer to actual value and then four more to skip size.
+ checkUTF8Char(ptr + offsetToValue + 4);
}
}
}
@@ -287,6 +290,15 @@ private:
objFrames.back().first.emplace_back(str);
};
}
+
+private:
+ void checkUTF8Char(const char* ptr) {  // Checks the bytes at ptr for invalid UTF-8; raises NonConformantBSON on failure.
+ try {
+ str::checkInvalidUTF8(ptr);  // NOTE(review): only a pointer is passed, so presumably checking stops at the first NUL - confirm.
+ } catch (const ExceptionFor<ErrorCodes::BadValue>&) {  // BadValue is the code checkInvalidUTF8 raises for an invalid byte.
+ uasserted(NonConformantBSON, "Found string that doesn't follow UTF-8 encoding.");  // Re-raise as NonConformantBSON.
+ }
+ }
};
template <bool precise, typename BSONValidator>
diff --git a/src/mongo/bson/bson_validate_test.cpp b/src/mongo/bson/bson_validate_test.cpp
index ad097ffbc43..8f4176bddc2 100644
--- a/src/mongo/bson/bson_validate_test.cpp
+++ b/src/mongo/bson/bson_validate_test.cpp
@@ -333,6 +333,63 @@ TEST(BSONValidateExtended, BSONArrayIndexes) {
ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
}
+TEST(BSONValidateExtended, BSONUTF8) {  // Malformed UTF-8 must fail kFull validation while kExtended still accepts it.
+ auto x1 = BSON("ValidString"
+ << "\x00"  // NOTE(review): presumably read as an empty C string, since its first byte is NUL - confirm.
+ << "ValidString2"
+ << "str");
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+ // Invalid UTF-8: 0x80 = 10000000. A single-byte character must have its leading bit clear.
+ x1 = BSON("InvalidOneByteString"
+ << "\x80");
+ auto status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+ ASSERT_OK(status);  // kExtended is expected to accept the document despite the bad byte.
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+ ASSERT_EQ(status, ErrorCodes::NonConformantBSON);  // kFull is expected to reject it.
+
+ x1 = BSON("ValidTwoByteString"
+ << "\x40\x40");  // 0x40 is ASCII '@': two single-byte characters, not one two-byte sequence.
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+ // Invalid UTF-8: 0xDF 0xCF = 11011111 11001111. A continuation byte must start with 10, so the second-highest bit of the second byte cannot be set.
+ x1 = BSON("InvalidTwoByteString"
+ << "\xDF\xCF");
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+ ASSERT_OK(status);
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+ ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+
+ x1 = BSON("ValidThreeByteString"
+ << "\x40\x40\x40");  // Three ASCII '@' bytes.
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+ // Invalid UTF-8: 0xEF 0xBF 0xFF = 11101111 10111111 11111111. The second-highest bit of the third byte cannot be set.
+ x1 = BSON("InvalidThreeByteString"
+ << "\xEF\xBF\xFF");
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+ ASSERT_OK(status);
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+ ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+
+ x1 = BSON("ValidFourByteString"
+ << "\x40\x40\x40\x40");  // Four ASCII '@' bytes.
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended));
+ ASSERT_OK(validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull));
+
+ // Invalid UTF-8: 0xF0 0x98 0x9A 0xFF = 11110000 10011000 10011010 11111111. The second-highest
+ // bit of the fourth byte cannot be set.
+ x1 = BSON("InvalidFourByteString"
+ << "\xF0\x98\x9A\xFF");
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kExtended);
+ ASSERT_OK(status);
+ status = validateBSON(x1.objdata(), x1.objsize(), mongo::BSONValidateMode::kFull);
+ ASSERT_EQ(status, ErrorCodes::NonConformantBSON);
+}
+
TEST(BSONValidateFast, Empty) {
BSONObj x;
ASSERT_OK(validateBSON(x));
diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp
index c42a916bab0..9dbb479af67 100644
--- a/src/mongo/util/str_escape.cpp
+++ b/src/mongo/util/str_escape.cpp
@@ -33,6 +33,8 @@
#include <array>
#include <iterator>
+#include "mongo/util/assert_util.h"
+
namespace mongo::str {
namespace {
constexpr char kHexChar[] = "0123456789abcdef";
@@ -496,4 +498,24 @@ std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite)
escapeForJSONCommon(buffer, str, maxLength, wouldWrite);
return buffer;
}
+// Scans str with escape() and raises ErrorCodes::BadValue if the invalid-byte handler is invoked.
+void checkInvalidUTF8(StringData str, size_t maxLength, size_t* wouldWrite) {
+ // Scratch buffer and no-op handlers exist only so escape()'s scanning logic can be reused; the buffer is discarded on return.
+ std::string buffer;
+ auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {};  // Intentionally empty.
+ auto twoByteEscaper = [](const auto& writer, uint8_t first, uint8_t second) {};  // Intentionally empty.
+
+ // Raises BadValue (via uasserted) when escape() invokes the invalid-byte handler.
+ auto invalidByteHandler = [](const auto& writer, uint8_t) {
+ uasserted(ErrorCodes::BadValue, "Invalid UTF-8 Character");
+ };
+
+ escape(buffer,
+ str,
+ std::move(singleByteHandler),
+ std::move(invalidByteHandler),
+ std::move(twoByteEscaper),
+ maxLength,  // Forwarded to escape() unchanged.
+ wouldWrite);  // Forwarded to escape() unchanged.
+}
} // namespace mongo::str
diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h
index 2d82e5697cd..14b89128a30 100644
--- a/src/mongo/util/str_escape.h
+++ b/src/mongo/util/str_escape.h
@@ -106,4 +106,9 @@ void escapeForJSON(fmt::memory_buffer& buffer,
std::string escapeForJSON(StringData str,
size_t maxLength = std::string::npos,
size_t* wouldWrite = nullptr);
+
+/** Raises ErrorCodes::BadValue if an invalid UTF-8 character is detected in str; returns nothing. */
+void checkInvalidUTF8(StringData str,
+ size_t maxLength = std::string::npos,  // Passed through to the underlying escape() call.
+ size_t* wouldWrite = nullptr);  // Passed through to the underlying escape() call.
} // namespace mongo::str