diff options
author | Justin Seyster <justin.seyster@mongodb.com> | 2017-08-18 15:28:13 -0400 |
---|---|---|
committer | Justin Seyster <justin.seyster@mongodb.com> | 2017-08-18 19:13:27 -0400 |
commit | 83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e (patch) | |
tree | ee81aa216726dfe9ab21fa7cf3b44e695653b82b | |
parent | aef10829fc71cb41c54df5838e9e7e74d41d122b (diff) | |
download | mongo-83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e.tar.gz |
SERVER-30467 $_internalSchema{Min|Max}Length counts UTF-8 code points.
6 files changed, 142 insertions, 25 deletions
diff --git a/buildscripts/resmokeconfig/suites/json_schema.yml b/buildscripts/resmokeconfig/suites/json_schema.yml index 8e995e2fa23..04b15687c1a 100644 --- a/buildscripts/resmokeconfig/suites/json_schema.yml +++ b/buildscripts/resmokeconfig/suites/json_schema.yml @@ -5,7 +5,9 @@ selector: - src/third_party/JSON-Schema-Test-Suite/tests/draft4/**/*.json include_files: - src/third_party/JSON-Schema-Test-Suite/tests/draft4/maximum.json + - src/third_party/JSON-Schema-Test-Suite/tests/draft4/maxLength.json - src/third_party/JSON-Schema-Test-Suite/tests/draft4/minimum.json + - src/third_party/JSON-Schema-Test-Suite/tests/draft4/minLength.json - src/third_party/JSON-Schema-Test-Suite/tests/draft4/multipleOf.json - src/third_party/JSON-Schema-Test-Suite/tests/draft4/pattern.json diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp index c0fd8bade2f..63b744fc829 100644 --- a/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp +++ b/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp @@ -83,6 +83,60 @@ TEST(InternalSchemaMaxLengthMatchExpression, RejectsNull) { ASSERT_FALSE(maxLength.matchesBSON(BSON("a" << BSONNULL))); } +TEST(InternalSchemaMaxLengthMatchExpression, TreatsMultiByteCodepointAsOneCharacter) { + InternalSchemaMaxLengthMatchExpression nonMatchingMaxLength; + InternalSchemaMaxLengthMatchExpression matchingMaxLength; + + ASSERT_OK(nonMatchingMaxLength.init("a", 0)); + ASSERT_OK(matchingMaxLength.init("a", 1)); + + // This string has one code point, so it should meet maximum length 1 but not maximum length 0. 
+ constexpr auto testString = u8"\U0001f4a9"; + ASSERT_FALSE(nonMatchingMaxLength.matchesBSON(BSON("a" << testString))); + ASSERT_TRUE(matchingMaxLength.matchesBSON(BSON("a" << testString))); +} + +TEST(InternalSchemaMaxLengthMatchExpression, CorectlyCountsUnicodeCodepoints) { + InternalSchemaMaxLengthMatchExpression nonMatchingMaxLength; + InternalSchemaMaxLengthMatchExpression matchingMaxLength; + + ASSERT_OK(nonMatchingMaxLength.init("a", 4)); + ASSERT_OK(matchingMaxLength.init("a", 5)); + + // A test string that contains single-byte, 2-byte, 3-byte, and 4-byte codepoints. + constexpr auto testString = + u8":" // Single-byte character + u8"\u00e9" // 2-byte character + u8")" // Single-byte character + u8"\U0001f4a9" // 4-byte character + u8"\U000020ac"; // 3-byte character + + // This string has five code points, so it should meet maximum length 5 but not maximum + // length 4. + ASSERT_FALSE(nonMatchingMaxLength.matchesBSON(BSON("a" << testString))); + ASSERT_TRUE(matchingMaxLength.matchesBSON(BSON("a" << testString))); +} + +TEST(InternalSchemaMaxLengthMatchExpression, DealsWithInvalidUTF8) { + InternalSchemaMaxLengthMatchExpression maxLength; + + ASSERT_OK(maxLength.init("a", 1)); + + // Several kinds of invalid byte sequences listed in the Wikipedia article about UTF-8: + // https://en.wikipedia.org/wiki/UTF-8 + constexpr auto testStringUnexpectedContinuationByte = "\xbf"; // Lone 10xxxxxx byte + constexpr auto testStringOverlongEncoding = "\xf0\x82\x82\xac"; + constexpr auto testStringInvalidCodePoint = "\xed\xa0\x80"; // U+d800 is not allowed + constexpr auto testStringLeadingByteWithoutContinuationByte = "\xdf"; + + // Because these inputs are invalid, we don't have any expectations about the answers we get. + // Our only requirement is that the test does not crash.
+ std::ignore = maxLength.matchesBSON(BSON("a" << testStringUnexpectedContinuationByte)); + std::ignore = maxLength.matchesBSON(BSON("a" << testStringOverlongEncoding)); + std::ignore = maxLength.matchesBSON(BSON("a" << testStringInvalidCodePoint)); + std::ignore = maxLength.matchesBSON(BSON("a" << testStringLeadingByteWithoutContinuationByte)); +} + TEST(InternalSchemaMaxLengthMatchExpression, NestedArraysWorkWithDottedPaths) { InternalSchemaMaxLengthMatchExpression maxLength; ASSERT_OK(maxLength.init("a.b", 2)); diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp index 39beedd6bb3..2e72f8ea2ac 100644 --- a/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp +++ b/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp @@ -82,6 +82,60 @@ TEST(InternalSchemaMinLengthMatchExpression, RejectsNull) { ASSERT_FALSE(minLength.matchesBSON(BSON("a" << BSONNULL))); } +TEST(InternalSchemaMinLengthMatchExpression, TreatsMultiByteCodepointAsOneCharacter) { + InternalSchemaMinLengthMatchExpression matchingMinLength; + InternalSchemaMinLengthMatchExpression nonMatchingMinLength; + + ASSERT_OK(matchingMinLength.init("a", 1)); + ASSERT_OK(nonMatchingMinLength.init("a", 2)); + + // This string has one code point, so it should meet minimum length 1 but not minimum length 2.
+ constexpr auto testString = u8"\U0001f4a9"; + ASSERT_TRUE(matchingMinLength.matchesBSON(BSON("a" << testString))); + ASSERT_FALSE(nonMatchingMinLength.matchesBSON(BSON("a" << testString))); +} + +TEST(InternalSchemaMinLengthMatchExpression, CorectlyCountsUnicodeCodepoints) { + InternalSchemaMinLengthMatchExpression matchingMinLength; + InternalSchemaMinLengthMatchExpression nonMatchingMinLength; + + ASSERT_OK(matchingMinLength.init("a", 5)); + ASSERT_OK(nonMatchingMinLength.init("a", 6)); + + // A test string that contains single-byte, 2-byte, 3-byte, and 4-byte code points. + constexpr auto testString = + u8":" // Single-byte character + u8"\u00e9" // 2-byte character + u8")" // Single-byte character + u8"\U0001f4a9" // 4-byte character + u8"\U000020ac"; // 3-byte character + + // This string has five code points, so it should meet minimum length 5 but not minimum + // length 6. + ASSERT_TRUE(matchingMinLength.matchesBSON(BSON("a" << testString))); + ASSERT_FALSE(nonMatchingMinLength.matchesBSON(BSON("a" << testString))); +} + +TEST(InternalSchemaMinLengthMatchExpression, DealsWithInvalidUTF8) { + InternalSchemaMinLengthMatchExpression minLength; + + ASSERT_OK(minLength.init("a", 1)); + + // Several kinds of invalid byte sequences listed in the Wikipedia article about UTF-8: + // https://en.wikipedia.org/wiki/UTF-8 + constexpr auto testStringUnexpectedContinuationByte = "\xbf"; // Lone 10xxxxxx byte + constexpr auto testStringOverlongEncoding = "\xf0\x82\x82\xac"; + constexpr auto testStringInvalidCodePoint = "\xed\xa0\x80"; // U+d800 is not allowed + constexpr auto testStringLeadingByteWithoutContinuationByte = "\xdf"; + + // Because these inputs are invalid, we don't have any expectations about the answers we get. + // Our only requirement is that the test does not crash.
+ std::ignore = minLength.matchesBSON(BSON("a" << testStringUnexpectedContinuationByte)); + std::ignore = minLength.matchesBSON(BSON("a" << testStringOverlongEncoding)); + std::ignore = minLength.matchesBSON(BSON("a" << testStringInvalidCodePoint)); + std::ignore = minLength.matchesBSON(BSON("a" << testStringLeadingByteWithoutContinuationByte)); +} + TEST(InternalSchemaMinLengthMatchExpression, NestedFieldsWorkWithDottedPaths) { InternalSchemaMinLengthMatchExpression minLength; ASSERT_OK(minLength.init("a.b", 2)); diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h b/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h index 5ff2ff7f52d..f83c6d584dc 100644 --- a/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h +++ b/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h @@ -56,11 +56,8 @@ public: return false; } - // BSONElement::valuestrsize() includes the terminating null character, so subtract 1 before - // comparing its length. - int lenWithoutNullTerm = elem.valuestrsize() - 1; - - return getComparator()(lenWithoutNullTerm); + auto len = str::lengthInUTF8CodePoints(elem.valueStringData()); + return getComparator()(len); }; void debugString(StringBuilder& debug, int level) const final; diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index d1bdaf6c983..c5877f5eee1 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -171,15 +171,6 @@ intrusive_ptr<Expression> Expression::parseOperand( namespace { /** * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially - * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a - * continuation byte.
- */ -bool isContinuationByte(char charByte) { - return (charByte & 0xc0) == 0x80; -} - -/** - * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading * byte. */ @@ -2692,8 +2683,9 @@ Value ExpressionIndexOfCP::evaluate(const Document& root) const { startByteIndex = byteIx; } - uassert( - 40095, "$indexOfCP found bad UTF-8 in the input", !isContinuationByte(input[byteIx])); + uassert(40095, + "$indexOfCP found bad UTF-8 in the input", + !str::isUTF8ContinuationByte(input[byteIx])); byteIx += getCodePointLength(input[byteIx]); } @@ -3882,7 +3874,7 @@ Value ExpressionSubstrBytes::evaluate(const Document& root) const { uassert(28656, str::stream() << getOpName() << ": Invalid range, starting index is a UTF-8 continuation byte.", - (lower >= str.length() || !isContinuationByte(str[lower]))); + (lower >= str.length() || !str::isUTF8ContinuationByte(str[lower]))); // Check the byte after the last character we'd return. If it is a continuation byte, that // means we're in the middle of a UTF-8 character. 
@@ -3890,7 +3882,7 @@ Value ExpressionSubstrBytes::evaluate(const Document& root) const { 28657, str::stream() << getOpName() << ": Invalid range, ending index is in the middle of a UTF-8 character.", - (lower + length >= str.length() || !isContinuationByte(str[lower + length]))); + (lower + length >= str.length() || !str::isUTF8ContinuationByte(str[lower + length]))); if (lower >= str.length()) { // If lower > str.length() then string::substr() will throw out_of_range, so return an @@ -3955,7 +3947,7 @@ Value ExpressionSubstrCP::evaluate(const Document& root) const { } uassert(34456, str::stream() << getOpName() << ": invalid UTF-8 string", - !isContinuationByte(str[startIndexBytes])); + !str::isUTF8ContinuationByte(str[startIndexBytes])); size_t codePointLength = getCodePointLength(str[startIndexBytes]); uassert( 34457, str::stream() << getOpName() << ": invalid UTF-8 string", codePointLength <= 4); @@ -3967,7 +3959,7 @@ Value ExpressionSubstrCP::evaluate(const Document& root) const { for (int i = 0; i < length && endIndexBytes < str.size(); i++) { uassert(34458, str::stream() << getOpName() << ": invalid UTF-8 string", - !isContinuationByte(str[endIndexBytes])); + !str::isUTF8ContinuationByte(str[endIndexBytes])); size_t codePointLength = getCodePointLength(str[endIndexBytes]); uassert( 34459, str::stream() << getOpName() << ": invalid UTF-8 string", codePointLength <= 4); @@ -4016,11 +4008,7 @@ Value ExpressionStrLenCP::evaluate(const Document& root) const { val.getType() == String); std::string stringVal = val.getString(); - - size_t strLen = 0; - for (char byte : stringVal) { - strLen += !isContinuationByte(byte); - } + size_t strLen = str::lengthInUTF8CodePoints(stringVal); uassert(34472, "string length could not be represented as an int.", diff --git a/src/mongo/util/mongoutils/str.h b/src/mongo/util/mongoutils/str.h index c1967003506..d85ddcb2abf 100644 --- a/src/mongo/util/mongoutils/str.h +++ b/src/mongo/util/mongoutils/str.h @@ -239,6 +239,28 @@ inline 
std::string ltrim(const std::string& s) { return p; } +/** + * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially + * many continuation bytes of the form 10xxxxxx. This function returns true when 'charByte' is a + * continuation byte, i.e. its two high bits are 10. + */ +inline bool isUTF8ContinuationByte(char charByte) { + return (charByte & 0xc0) == 0x80; +} + +/** + * Assuming 'str' stores a UTF-8 string, returns the number of UTF-8 code points. The return value + * is not meaningful if the input is not a well-formed UTF-8 string. + */ +inline size_t lengthInUTF8CodePoints(mongo::StringData str) { + size_t strLen = 0; + for (char byte : str) { + strLen += !isUTF8ContinuationByte(byte); // Counts every non-continuation byte. + } + + return strLen; +} + } // namespace str } // namespace mongoutils |