summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Seyster <justin.seyster@mongodb.com>2017-08-18 15:28:13 -0400
committerJustin Seyster <justin.seyster@mongodb.com>2017-08-18 19:13:27 -0400
commit83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e (patch)
treeee81aa216726dfe9ab21fa7cf3b44e695653b82b
parentaef10829fc71cb41c54df5838e9e7e74d41d122b (diff)
downloadmongo-83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e.tar.gz
SERVER-30467 $_internalSchema{Min|Max}Length counts UTF-8 code points.
-rw-r--r--buildscripts/resmokeconfig/suites/json_schema.yml2
-rw-r--r--src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp54
-rw-r--r--src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp54
-rw-r--r--src/mongo/db/matcher/schema/expression_internal_schema_str_length.h7
-rw-r--r--src/mongo/db/pipeline/expression.cpp28
-rw-r--r--src/mongo/util/mongoutils/str.h22
6 files changed, 142 insertions, 25 deletions
diff --git a/buildscripts/resmokeconfig/suites/json_schema.yml b/buildscripts/resmokeconfig/suites/json_schema.yml
index 8e995e2fa23..04b15687c1a 100644
--- a/buildscripts/resmokeconfig/suites/json_schema.yml
+++ b/buildscripts/resmokeconfig/suites/json_schema.yml
@@ -5,7 +5,9 @@ selector:
- src/third_party/JSON-Schema-Test-Suite/tests/draft4/**/*.json
include_files:
- src/third_party/JSON-Schema-Test-Suite/tests/draft4/maximum.json
+ - src/third_party/JSON-Schema-Test-Suite/tests/draft4/maxLength.json
- src/third_party/JSON-Schema-Test-Suite/tests/draft4/minimum.json
+ - src/third_party/JSON-Schema-Test-Suite/tests/draft4/minLength.json
- src/third_party/JSON-Schema-Test-Suite/tests/draft4/multipleOf.json
- src/third_party/JSON-Schema-Test-Suite/tests/draft4/pattern.json
diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp
index c0fd8bade2f..63b744fc829 100644
--- a/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp
+++ b/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp
@@ -83,6 +83,60 @@ TEST(InternalSchemaMaxLengthMatchExpression, RejectsNull) {
ASSERT_FALSE(maxLength.matchesBSON(BSON("a" << BSONNULL)));
}
+// A code point that needs several UTF-8 bytes must still count as a single character when it is
+// compared against the maximum length.
+TEST(InternalSchemaMaxLengthMatchExpression, TreatsMultiByteCodepointAsOneCharacter) {
+    // U+1F4A9 takes four bytes in UTF-8 but is exactly one code point.
+    constexpr auto testString = u8"\U0001f4a9";
+
+    InternalSchemaMaxLengthMatchExpression limitOfZero;
+    InternalSchemaMaxLengthMatchExpression limitOfOne;
+
+    ASSERT_OK(limitOfZero.init("a", 0));
+    ASSERT_OK(limitOfOne.init("a", 1));
+
+    // One code point exceeds a maximum of 0 and satisfies a maximum of 1.
+    ASSERT_FALSE(limitOfZero.matchesBSON(BSON("a" << testString)));
+    ASSERT_TRUE(limitOfOne.matchesBSON(BSON("a" << testString)));
+}
+
+// Mixes every UTF-8 encoding width in one string and checks that the maximum length is applied to
+// the number of code points rather than the number of bytes.
+TEST(InternalSchemaMaxLengthMatchExpression, CorectlyCountsUnicodeCodepoints) {
+    // Five code points in total, covering the 1-, 2-, 3-, and 4-byte encodings.
+    constexpr auto testString =
+        u8":"           // 1 byte
+        u8"\u00e9"      // 2 bytes
+        u8")"           // 1 byte
+        u8"\U0001f4a9"  // 4 bytes
+        u8"\U000020ac"; // 3 bytes
+
+    InternalSchemaMaxLengthMatchExpression limitOfFour;
+    InternalSchemaMaxLengthMatchExpression limitOfFive;
+
+    ASSERT_OK(limitOfFour.init("a", 4));
+    ASSERT_OK(limitOfFive.init("a", 5));
+
+    // Five code points exceed a maximum of 4 and satisfy a maximum of 5.
+    ASSERT_FALSE(limitOfFour.matchesBSON(BSON("a" << testString)));
+    ASSERT_TRUE(limitOfFive.matchesBSON(BSON("a" << testString)));
+}
+
+// Feeds several malformed UTF-8 byte sequences to the matcher. The match results are deliberately
+// ignored; the test passes as long as evaluating them does not crash.
+TEST(InternalSchemaMaxLengthMatchExpression, DealsWithInvalidUTF8) {
+    InternalSchemaMaxLengthMatchExpression maxLength;
+
+    ASSERT_OK(maxLength.init("a", 1));
+
+    // Several kinds of invalid byte sequences listed in the Wikipedia article about UTF-8:
+    // https://en.wikipedia.org/wiki/UTF-8
+    // The first literal needs the "\x" escape: "\bf" would be a backspace followed by 'f', which is
+    // valid ASCII and would not exercise the stray-continuation-byte case.
+    constexpr auto testStringUnexpectedContinuationByte = "\xbf";
+    constexpr auto testStringOverlongEncoding = "\xf0\x82\x82\xac";
+    constexpr auto testStringInvalidCodePoint = "\xed\xa0\x80";  // U+d800 is not allowed
+    constexpr auto testStringLeadingByteWithoutContinuationByte = "\xdf";
+
+    // Because these inputs are invalid, we don't have any expectations about the answers we get.
+    // Our only requirement is that the test does not crash.
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringUnexpectedContinuationByte));
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringOverlongEncoding));
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringInvalidCodePoint));
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringLeadingByteWithoutContinuationByte));
+}
+
TEST(InternalSchemaMaxLengthMatchExpression, NestedArraysWorkWithDottedPaths) {
InternalSchemaMaxLengthMatchExpression maxLength;
ASSERT_OK(maxLength.init("a.b", 2));
diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp
index 39beedd6bb3..2e72f8ea2ac 100644
--- a/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp
+++ b/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp
@@ -82,6 +82,60 @@ TEST(InternalSchemaMinLengthMatchExpression, RejectsNull) {
ASSERT_FALSE(minLength.matchesBSON(BSON("a" << BSONNULL)));
}
+// A code point that needs several UTF-8 bytes must still count as a single character when it is
+// compared against the minimum length.
+TEST(InternalSchemaMinLengthMatchExpression, TreatsMultiByteCodepointAsOneCharacter) {
+    // U+1F4A9 takes four bytes in UTF-8 but is exactly one code point.
+    constexpr auto testString = u8"\U0001f4a9";
+
+    InternalSchemaMinLengthMatchExpression atLeastOne;
+    InternalSchemaMinLengthMatchExpression atLeastTwo;
+
+    ASSERT_OK(atLeastOne.init("a", 1));
+    ASSERT_OK(atLeastTwo.init("a", 2));
+
+    // One code point satisfies a minimum of 1 and falls short of a minimum of 2.
+    ASSERT_TRUE(atLeastOne.matchesBSON(BSON("a" << testString)));
+    ASSERT_FALSE(atLeastTwo.matchesBSON(BSON("a" << testString)));
+}
+
+// Mixes every UTF-8 encoding width in one string and checks that the minimum length is applied to
+// the number of code points rather than the number of bytes.
+TEST(InternalSchemaMinLengthMatchExpression, CorectlyCountsUnicodeCodepoints) {
+    // Five code points in total, covering the 1-, 2-, 3-, and 4-byte encodings.
+    constexpr auto testString =
+        u8":"           // 1 byte
+        u8"\u00e9"      // 2 bytes
+        u8")"           // 1 byte
+        u8"\U0001f4a9"  // 4 bytes
+        u8"\U000020ac"; // 3 bytes
+
+    InternalSchemaMinLengthMatchExpression atLeastFive;
+    InternalSchemaMinLengthMatchExpression atLeastSix;
+
+    ASSERT_OK(atLeastFive.init("a", 5));
+    ASSERT_OK(atLeastSix.init("a", 6));
+
+    // Five code points satisfy a minimum of 5 and fall short of a minimum of 6.
+    ASSERT_TRUE(atLeastFive.matchesBSON(BSON("a" << testString)));
+    ASSERT_FALSE(atLeastSix.matchesBSON(BSON("a" << testString)));
+}
+
+// Feeds several malformed UTF-8 byte sequences to the matcher. The match results are deliberately
+// ignored; the test passes as long as evaluating them does not crash.
+TEST(InternalSchemaMinLengthMatchExpression, DealsWithInvalidUTF8) {
+    InternalSchemaMinLengthMatchExpression minLength;
+
+    ASSERT_OK(minLength.init("a", 1));
+
+    // Several kinds of invalid byte sequences listed in the Wikipedia article about UTF-8:
+    // https://en.wikipedia.org/wiki/UTF-8
+    // The first literal needs the "\x" escape: "\bf" would be a backspace followed by 'f', which is
+    // valid ASCII and would not exercise the stray-continuation-byte case.
+    constexpr auto testStringUnexpectedContinuationByte = "\xbf";
+    constexpr auto testStringOverlongEncoding = "\xf0\x82\x82\xac";
+    constexpr auto testStringInvalidCodePoint = "\xed\xa0\x80";  // U+d800 is not allowed
+    constexpr auto testStringLeadingByteWithoutContinuationByte = "\xdf";
+
+    // Because these inputs are invalid, we don't have any expectations about the answers we get.
+    // Our only requirement is that the test does not crash.
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringUnexpectedContinuationByte));
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringOverlongEncoding));
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringInvalidCodePoint));
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringLeadingByteWithoutContinuationByte));
+}
+
TEST(InternalSchemaMinLengthMatchExpression, NestedFieldsWorkWithDottedPaths) {
InternalSchemaMinLengthMatchExpression minLength;
ASSERT_OK(minLength.init("a.b", 2));
diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h b/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h
index 5ff2ff7f52d..f83c6d584dc 100644
--- a/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h
+++ b/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h
@@ -56,11 +56,8 @@ public:
return false;
}
- // BSONElement::valuestrsize() includes the terminating null character, so subtract 1 before
- // comparing its length.
- int lenWithoutNullTerm = elem.valuestrsize() - 1;
-
- return getComparator()(lenWithoutNullTerm);
+ auto len = str::lengthInUTF8CodePoints(elem.valueStringData());
+ return getComparator()(len);
};
void debugString(StringBuilder& debug, int level) const final;
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index d1bdaf6c983..c5877f5eee1 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -171,15 +171,6 @@ intrusive_ptr<Expression> Expression::parseOperand(
namespace {
/**
* UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
- * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a
- * continuation byte.
- */
-bool isContinuationByte(char charByte) {
- return (charByte & 0xc0) == 0x80;
-}
-
-/**
- * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
* many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading
* byte.
*/
@@ -2692,8 +2683,9 @@ Value ExpressionIndexOfCP::evaluate(const Document& root) const {
startByteIndex = byteIx;
}
- uassert(
- 40095, "$indexOfCP found bad UTF-8 in the input", !isContinuationByte(input[byteIx]));
+ uassert(40095,
+ "$indexOfCP found bad UTF-8 in the input",
+ !str::isUTF8ContinuationByte(input[byteIx]));
byteIx += getCodePointLength(input[byteIx]);
}
@@ -3882,7 +3874,7 @@ Value ExpressionSubstrBytes::evaluate(const Document& root) const {
uassert(28656,
str::stream() << getOpName()
<< ": Invalid range, starting index is a UTF-8 continuation byte.",
- (lower >= str.length() || !isContinuationByte(str[lower])));
+ (lower >= str.length() || !str::isUTF8ContinuationByte(str[lower])));
// Check the byte after the last character we'd return. If it is a continuation byte, that
// means we're in the middle of a UTF-8 character.
@@ -3890,7 +3882,7 @@ Value ExpressionSubstrBytes::evaluate(const Document& root) const {
28657,
str::stream() << getOpName()
<< ": Invalid range, ending index is in the middle of a UTF-8 character.",
- (lower + length >= str.length() || !isContinuationByte(str[lower + length])));
+ (lower + length >= str.length() || !str::isUTF8ContinuationByte(str[lower + length])));
if (lower >= str.length()) {
// If lower > str.length() then string::substr() will throw out_of_range, so return an
@@ -3955,7 +3947,7 @@ Value ExpressionSubstrCP::evaluate(const Document& root) const {
}
uassert(34456,
str::stream() << getOpName() << ": invalid UTF-8 string",
- !isContinuationByte(str[startIndexBytes]));
+ !str::isUTF8ContinuationByte(str[startIndexBytes]));
size_t codePointLength = getCodePointLength(str[startIndexBytes]);
uassert(
34457, str::stream() << getOpName() << ": invalid UTF-8 string", codePointLength <= 4);
@@ -3967,7 +3959,7 @@ Value ExpressionSubstrCP::evaluate(const Document& root) const {
for (int i = 0; i < length && endIndexBytes < str.size(); i++) {
uassert(34458,
str::stream() << getOpName() << ": invalid UTF-8 string",
- !isContinuationByte(str[endIndexBytes]));
+ !str::isUTF8ContinuationByte(str[endIndexBytes]));
size_t codePointLength = getCodePointLength(str[endIndexBytes]);
uassert(
34459, str::stream() << getOpName() << ": invalid UTF-8 string", codePointLength <= 4);
@@ -4016,11 +4008,7 @@ Value ExpressionStrLenCP::evaluate(const Document& root) const {
val.getType() == String);
std::string stringVal = val.getString();
-
- size_t strLen = 0;
- for (char byte : stringVal) {
- strLen += !isContinuationByte(byte);
- }
+ size_t strLen = str::lengthInUTF8CodePoints(stringVal);
uassert(34472,
"string length could not be represented as an int.",
diff --git a/src/mongo/util/mongoutils/str.h b/src/mongo/util/mongoutils/str.h
index c1967003506..d85ddcb2abf 100644
--- a/src/mongo/util/mongoutils/str.h
+++ b/src/mongo/util/mongoutils/str.h
@@ -239,6 +239,28 @@ inline std::string ltrim(const std::string& s) {
return p;
}
+/**
+ * Reports whether 'charByte' is a UTF-8 continuation byte, i.e. a byte whose two most significant
+ * bits are 10 (bit pattern 10xxxxxx). In a multi-byte code point, the leading byte has the form
+ * 11xxxxxx and every byte after it is a continuation byte.
+ */
+inline bool isUTF8ContinuationByte(char charByte) {
+    constexpr int kTopTwoBitsMask = 0xc0;
+    constexpr int kContinuationTag = 0x80;
+    return (charByte & kTopTwoBitsMask) == kContinuationTag;
+}
+
+/**
+ * Counts the code points in 'str', which is expected to hold UTF-8 text, by tallying every byte
+ * that is not a continuation byte. If 'str' is not well-formed UTF-8, the returned value carries
+ * no defined meaning.
+ */
+inline size_t lengthInUTF8CodePoints(mongo::StringData str) {
+    size_t codePoints = 0;
+    for (const char current : str) {
+        if (!isUTF8ContinuationByte(current)) {
+            ++codePoints;
+        }
+    }
+
+    return codePoints;
+}
+
} // namespace str
} // namespace mongoutils