SERVER-30467 $_internalSchema{Min|Max}Length counts UTF-8 code points.

author: Justin Seyster <justin.seyster@mongodb.com> 2017-08-18 15:28:13 -0400
committer: Justin Seyster <justin.seyster@mongodb.com> 2017-08-18 19:13:27 -0400
commit: 83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e (patch)
tree: ee81aa216726dfe9ab21fa7cf3b44e695653b82b
parent: aef10829fc71cb41c54df5838e9e7e74d41d122b (diff)
download: mongo-83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e.tar.gz
6 files changed, 142 insertions, 25 deletions
diff --git a/buildscripts/resmokeconfig/suites/json_schema.yml b/buildscripts/resmokeconfig/suites/json_schema.yml
index 8e995e2fa23..04b15687c1a 100644
--- a/buildscripts/resmokeconfig/suites/json_schema.yml
+++ b/buildscripts/resmokeconfig/suites/json_schema.yml
@@ -5,7 +5,9 @@ selector:
   - src/third_party/JSON-Schema-Test-Suite/tests/draft4/**/*.json
   include_files:
   - src/third_party/JSON-Schema-Test-Suite/tests/draft4/maximum.json
+  - src/third_party/JSON-Schema-Test-Suite/tests/draft4/maxLength.json
   - src/third_party/JSON-Schema-Test-Suite/tests/draft4/minimum.json
+  - src/third_party/JSON-Schema-Test-Suite/tests/draft4/minLength.json
   - src/third_party/JSON-Schema-Test-Suite/tests/draft4/multipleOf.json
   - src/third_party/JSON-Schema-Test-Suite/tests/draft4/pattern.json
 
diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp
index c0fd8bade2f..63b744fc829 100644
--- a/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp
+++ b/src/mongo/db/matcher/schema/expression_internal_schema_max_length_test.cpp
@@ -83,6 +83,60 @@ TEST(InternalSchemaMaxLengthMatchExpression, RejectsNull) {
     ASSERT_FALSE(maxLength.matchesBSON(BSON("a" << BSONNULL)));
 }
 
+TEST(InternalSchemaMaxLengthMatchExpression, TreatsMultiByteCodepointAsOneCharacter) {
+    // U+1F4A9 is four bytes in UTF-8 but one code point: it fits maximum length 1, not 0.
+    constexpr auto singleCodePoint = u8"\U0001f4a9";
+
+    InternalSchemaMaxLengthMatchExpression maxLengthZero;
+    ASSERT_OK(maxLengthZero.init("a", 0));
+    ASSERT_FALSE(maxLengthZero.matchesBSON(BSON("a" << singleCodePoint)));
+
+    InternalSchemaMaxLengthMatchExpression maxLengthOne;
+    ASSERT_OK(maxLengthOne.init("a", 1));
+    ASSERT_TRUE(maxLengthOne.matchesBSON(BSON("a" << singleCodePoint)));
+}
+
+TEST(InternalSchemaMaxLengthMatchExpression, CorrectlyCountsUnicodeCodepoints) {
+    InternalSchemaMaxLengthMatchExpression nonMatchingMaxLength;
+    InternalSchemaMaxLengthMatchExpression matchingMaxLength;
+
+    ASSERT_OK(nonMatchingMaxLength.init("a", 4));
+    ASSERT_OK(matchingMaxLength.init("a", 5));
+
+    // A test string that contains single-byte, 2-byte, 3-byte, and 4-byte code points.
+    constexpr auto testString =
+        u8":"            // Single-byte character
+        u8"\u00e9"       // 2-byte character
+        u8")"            // Single-byte character
+        u8"\U0001f4a9"   // 4-byte character
+        u8"\U000020ac";  // 3-byte character
+
+    // This string has five code points spread over 11 bytes, so it should meet maximum length 5
+    // but not maximum length 4; a byte-based length would fail the maximum length 5 check.
+    ASSERT_FALSE(nonMatchingMaxLength.matchesBSON(BSON("a" << testString)));
+    ASSERT_TRUE(matchingMaxLength.matchesBSON(BSON("a" << testString)));
+}
+
+TEST(InternalSchemaMaxLengthMatchExpression, DealsWithInvalidUTF8) {
+    InternalSchemaMaxLengthMatchExpression maxLength;
+
+    ASSERT_OK(maxLength.init("a", 1));
+
+    // Several kinds of invalid byte sequences listed in the Wikipedia article about UTF-8:
+    // https://en.wikipedia.org/wiki/UTF-8
+    constexpr auto testStringUnexpectedContinuationByte = "\xbf";  // lone 10xxxxxx byte
+    constexpr auto testStringOverlongEncoding = "\xf0\x82\x82\xac";  // 4-byte form of U+20ac
+    constexpr auto testStringInvalidCodePoint = "\xed\xa0\x80";  // U+d800 is not allowed
+    constexpr auto testStringLeadingByteWithoutContinuationByte = "\xdf";  // 2-byte lead only
+
+    // Because these inputs are invalid, we don't have any expectations about the answers we get.
+    // Our only requirement is that the test does not crash.
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringUnexpectedContinuationByte));
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringOverlongEncoding));
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringInvalidCodePoint));
+    std::ignore = maxLength.matchesBSON(BSON("a" << testStringLeadingByteWithoutContinuationByte));
+}
+
 TEST(InternalSchemaMaxLengthMatchExpression, NestedArraysWorkWithDottedPaths) {
     InternalSchemaMaxLengthMatchExpression maxLength;
     ASSERT_OK(maxLength.init("a.b", 2));
diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp b/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp
index 39beedd6bb3..2e72f8ea2ac 100644
--- a/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp
+++ b/src/mongo/db/matcher/schema/expression_internal_schema_min_length_test.cpp
@@ -82,6 +82,60 @@ TEST(InternalSchemaMinLengthMatchExpression, RejectsNull) {
     ASSERT_FALSE(minLength.matchesBSON(BSON("a" << BSONNULL)));
 }
 
+TEST(InternalSchemaMinLengthMatchExpression, TreatsMultiByteCodepointAsOneCharacter) {
+    // U+1F4A9 is four bytes in UTF-8 but one code point: it meets minimum length 1, not 2.
+    constexpr auto singleCodePoint = u8"\U0001f4a9";
+
+    InternalSchemaMinLengthMatchExpression minLengthOne;
+    ASSERT_OK(minLengthOne.init("a", 1));
+    ASSERT_TRUE(minLengthOne.matchesBSON(BSON("a" << singleCodePoint)));
+
+    InternalSchemaMinLengthMatchExpression minLengthTwo;
+    ASSERT_OK(minLengthTwo.init("a", 2));
+    ASSERT_FALSE(minLengthTwo.matchesBSON(BSON("a" << singleCodePoint)));
+}
+
+TEST(InternalSchemaMinLengthMatchExpression, CorrectlyCountsUnicodeCodepoints) {
+    InternalSchemaMinLengthMatchExpression matchingMinLength;
+    InternalSchemaMinLengthMatchExpression nonMatchingMinLength;
+
+    ASSERT_OK(matchingMinLength.init("a", 5));
+    ASSERT_OK(nonMatchingMinLength.init("a", 6));
+
+    // A test string that contains single-byte, 2-byte, 3-byte, and 4-byte code points.
+    constexpr auto testString =
+        u8":"            // Single-byte character
+        u8"\u00e9"       // 2-byte character
+        u8")"            // Single-byte character
+        u8"\U0001f4a9"   // 4-byte character
+        u8"\U000020ac";  // 3-byte character
+
+    // This string has five code points spread over 11 bytes, so it should meet minimum length 5
+    // but not minimum length 6; a byte-based length would wrongly meet minimum length 6.
+    ASSERT_TRUE(matchingMinLength.matchesBSON(BSON("a" << testString)));
+    ASSERT_FALSE(nonMatchingMinLength.matchesBSON(BSON("a" << testString)));
+}
+
+TEST(InternalSchemaMinLengthMatchExpression, DealsWithInvalidUTF8) {
+    InternalSchemaMinLengthMatchExpression minLength;
+
+    ASSERT_OK(minLength.init("a", 1));
+
+    // Several kinds of invalid byte sequences listed in the Wikipedia article about UTF-8:
+    // https://en.wikipedia.org/wiki/UTF-8
+    constexpr auto testStringUnexpectedContinuationByte = "\xbf";  // lone 10xxxxxx byte
+    constexpr auto testStringOverlongEncoding = "\xf0\x82\x82\xac";  // 4-byte form of U+20ac
+    constexpr auto testStringInvalidCodePoint = "\xed\xa0\x80";  // U+d800 is not allowed
+    constexpr auto testStringLeadingByteWithoutContinuationByte = "\xdf";  // 2-byte lead only
+
+    // Because these inputs are invalid, we don't have any expectations about the answers we get.
+    // Our only requirement is that the test does not crash.
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringUnexpectedContinuationByte));
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringOverlongEncoding));
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringInvalidCodePoint));
+    std::ignore = minLength.matchesBSON(BSON("a" << testStringLeadingByteWithoutContinuationByte));
+}
+
 TEST(InternalSchemaMinLengthMatchExpression, NestedFieldsWorkWithDottedPaths) {
     InternalSchemaMinLengthMatchExpression minLength;
     ASSERT_OK(minLength.init("a.b", 2));
diff --git a/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h b/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h
index 5ff2ff7f52d..f83c6d584dc 100644
--- a/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h
+++ b/src/mongo/db/matcher/schema/expression_internal_schema_str_length.h
@@ -56,11 +56,8 @@ public:
             return false;
         }
 
-        // BSONElement::valuestrsize() includes the terminating null character, so subtract 1 before
-        // comparing its length.
-        int lenWithoutNullTerm = elem.valuestrsize() - 1;
-
-        return getComparator()(lenWithoutNullTerm);
+        auto len = str::lengthInUTF8CodePoints(elem.valueStringData());
+        return getComparator()(len);
     };
 
     void debugString(StringBuilder& debug, int level) const final;
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index d1bdaf6c983..c5877f5eee1 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -171,15 +171,6 @@ intrusive_ptr<Expression> Expression::parseOperand(
 namespace {
 /**
  * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
- * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a
- * continuation byte.
- */
-bool isContinuationByte(char charByte) {
-    return (charByte & 0xc0) == 0x80;
-}
-
-/**
- * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
  * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading
  * byte.
  */
@@ -2692,8 +2683,9 @@ Value ExpressionIndexOfCP::evaluate(const Document& root) const {
             startByteIndex = byteIx;
         }
 
-        uassert(
-            40095, "$indexOfCP found bad UTF-8 in the input", !isContinuationByte(input[byteIx]));
+        uassert(40095,
+                "$indexOfCP found bad UTF-8 in the input",
+                !str::isUTF8ContinuationByte(input[byteIx]));
         byteIx += getCodePointLength(input[byteIx]);
     }
 
@@ -3882,7 +3874,7 @@ Value ExpressionSubstrBytes::evaluate(const Document& root) const {
     uassert(28656,
             str::stream() << getOpName()
                           << ":  Invalid range, starting index is a UTF-8 continuation byte.",
-            (lower >= str.length() || !isContinuationByte(str[lower])));
+            (lower >= str.length() || !str::isUTF8ContinuationByte(str[lower])));
 
     // Check the byte after the last character we'd return. If it is a continuation byte, that
     // means we're in the middle of a UTF-8 character.
@@ -3890,7 +3882,7 @@ Value ExpressionSubstrBytes::evaluate(const Document& root) const {
         28657,
         str::stream() << getOpName()
                       << ":  Invalid range, ending index is in the middle of a UTF-8 character.",
-        (lower + length >= str.length() || !isContinuationByte(str[lower + length])));
+        (lower + length >= str.length() || !str::isUTF8ContinuationByte(str[lower + length])));
 
     if (lower >= str.length()) {
         // If lower > str.length() then string::substr() will throw out_of_range, so return an
@@ -3955,7 +3947,7 @@ Value ExpressionSubstrCP::evaluate(const Document& root) const {
         }
         uassert(34456,
                 str::stream() << getOpName() << ": invalid UTF-8 string",
-                !isContinuationByte(str[startIndexBytes]));
+                !str::isUTF8ContinuationByte(str[startIndexBytes]));
         size_t codePointLength = getCodePointLength(str[startIndexBytes]);
         uassert(
             34457, str::stream() << getOpName() << ": invalid UTF-8 string", codePointLength <= 4);
@@ -3967,7 +3959,7 @@ Value ExpressionSubstrCP::evaluate(const Document& root) const {
     for (int i = 0; i < length && endIndexBytes < str.size(); i++) {
         uassert(34458,
                 str::stream() << getOpName() << ": invalid UTF-8 string",
-                !isContinuationByte(str[endIndexBytes]));
+                !str::isUTF8ContinuationByte(str[endIndexBytes]));
         size_t codePointLength = getCodePointLength(str[endIndexBytes]);
         uassert(
             34459, str::stream() << getOpName() << ": invalid UTF-8 string", codePointLength <= 4);
@@ -4016,11 +4008,7 @@ Value ExpressionStrLenCP::evaluate(const Document& root) const {
             val.getType() == String);
 
     std::string stringVal = val.getString();
-
-    size_t strLen = 0;
-    for (char byte : stringVal) {
-        strLen += !isContinuationByte(byte);
-    }
+    size_t strLen = str::lengthInUTF8CodePoints(stringVal);
 
     uassert(34472,
             "string length could not be represented as an int.",
diff --git a/src/mongo/util/mongoutils/str.h b/src/mongo/util/mongoutils/str.h
index c1967003506..d85ddcb2abf 100644
--- a/src/mongo/util/mongoutils/str.h
+++ b/src/mongo/util/mongoutils/str.h
@@ -239,6 +239,28 @@ inline std::string ltrim(const std::string& s) {
     return p;
 }
 
+/**
+ * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, followed by one
+ * or more continuation bytes of the form 10xxxxxx. Returns true if the top two bits of 'charByte'
+ * are 10, i.e. if it is a continuation byte.
+ */
+inline bool isUTF8ContinuationByte(char charByte) {
+    return (charByte & 0xc0) == 0x80;
+}
+
+/**
+ * Assuming 'str' stores a UTF-8 string, returns the number of UTF-8 code points it contains. The
+ * return value is undefined if the input is not a well-formed UTF-8 string.
+ */
+inline size_t lengthInUTF8CodePoints(mongo::StringData str) {
+    size_t strLen = 0;
+    for (char byte : str) {
+        strLen += !isUTF8ContinuationByte(byte);  // count bytes that start a code point
+    }
+
+    return strLen;
+}
+
 }  // namespace str
 
 }  // namespace mongoutils
author	Justin Seyster <justin.seyster@mongodb.com>	2017-08-18 15:28:13 -0400
committer	Justin Seyster <justin.seyster@mongodb.com>	2017-08-18 19:13:27 -0400
commit	83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e (patch)
tree	ee81aa216726dfe9ab21fa7cf3b44e695653b82b
parent	aef10829fc71cb41c54df5838e9e7e74d41d122b (diff)
download	mongo-83e1ff54a1ef7fed5050e01ee9672f7dcf460e3e.tar.gz