SERVER-22580 Aggregation now supports substrCP.

author: Benjamin Murphy <benjamin_murphy@me.com> 2016-03-17 15:21:48 -0400
committer: Benjamin Murphy <benjamin_murphy@me.com> 2016-03-25 10:37:50 -0400
commit: 5afa97da4ce5049ef7eb8bf4717ce37bd6777754 (patch)
tree: 94f9cf0e1581f914ac26c8907a93030dd07c570d
parent: 10b650835a78fbf7ebe9cf26926cc45556efd178 (diff)
download: mongo-5afa97da4ce5049ef7eb8bf4717ce37bd6777754.tar.gz
7 files changed, 261 insertions, 38 deletions
diff --git a/jstests/aggregation/bugs/server22580.js b/jstests/aggregation/bugs/server22580.js
new file mode 100644
index 00000000000..afbfdd00dcd
--- /dev/null
+++ b/jstests/aggregation/bugs/server22580.js
@@ -0,0 +1,43 @@
+// In SERVER-22580, the $substrCP expression was introduced. In this file, we test the error cases
+// of this expression.
+load("jstests/aggregation/extras/utils.js");  // For assertErrorCode.
+
+(function() {
+    "use strict";
+
+    var coll = db.substrCP;
+    coll.drop();
+
+    // Need an empty document for pipeline.
+    coll.insert({});
+
+    assertErrorCode(coll,
+                        [{$project: {substr: {$substrCP: ["abc", 0, "a"]}}}],
+                    34452,
+                    "$substrCP" + " does not accept non-numeric types as a length.");
+
+    assertErrorCode(coll,
+                        [{$project: {substr: {$substrCP: ["abc", 0, NaN]}}}],
+                    34453,
+                    "$substrCP" + " does not accept non-integers as a length.");
+
+    assertErrorCode(coll,
+                        [{$project: {substr: {$substrCP: ["abc", "abc", 3]}}}],
+                    34450,
+                    "$substrCP does not accept non-numeric types as a starting index.");
+
+    assertErrorCode(coll,
+                        [{$project: {substr: {$substrCP: ["abc", 2.2, 3]}}}],
+                    34451,
+                    "$substrCP" + " does not accept non-integers as a starting index.");
+
+    assertErrorCode(coll,
+                        [{$project: {substr: {$substrCP: ["abc", -1, 3]}}}],
+                    34455,
+                    "$substrCP " + "does not accept negative integers as inputs.");
+
+    assertErrorCode(coll,
+                        [{$project: {substr: {$substrCP: ["abc", 1, -3]}}}],
+                    34454,
+                    "$substrCP " + "does not accept negative integers as inputs.");
+}());
diff --git a/jstests/aggregation/bugs/server6556.js b/jstests/aggregation/bugs/server6556.js
index 721b9fd5a98..636bef6b02c 100644
--- a/jstests/aggregation/bugs/server6556.js
+++ b/jstests/aggregation/bugs/server6556.js
@@ -9,8 +9,9 @@ c.save({foo: "as\0df"});
 assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", "$foo"]}}}).toArray(),
               [{matches: true}]);
 // compare with the substring containing only the up to the null, they should not match
-assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", {$substr: ["$foo", 0, 3]}]}}})
-              .toArray(),
+assert.eq(c.aggregate({
+    $project: {_id: 0, matches: {$eq: ["as\0df", {$substrBytes: ["$foo", 0, 3]}]}}
+}).toArray(),
               [{matches: false}]);
 // partial the other way shouldnt work either
 assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as", "$foo"]}}}).toArray(),
@@ -19,6 +20,4 @@ assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as", "$foo"]}}}).toAr
 assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0de", "$foo"]}}}).toArray(),
               [{matches: false}]);
 // should assert on fieldpaths with a null
-assert.throws(function() {
-    c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", "$f\0oo"]}}});
-});
+assert.throws(c.aggregate, {$project: {_id: 0, matches: {$eq: ["as\0df", "$f\0oo"]}}});
diff --git a/jstests/aggregation/bugs/substr.js b/jstests/aggregation/bugs/substr.js
index 4aee531dd67..9b514eb4679 100644
--- a/jstests/aggregation/bugs/substr.js
+++ b/jstests/aggregation/bugs/substr.js
@@ -1,4 +1,4 @@
-// Aggregation $substr tests.
+// Aggregation $substrBytes tests.
 
 t = db.jstests_aggregation_substr;
 t.drop();
@@ -6,11 +6,12 @@ t.drop();
 t.save({});
 
 function assertSubstring(expected, str, offset, len) {
-    assert.eq(expected, t.aggregate({$project: {a: {$substr: [str, offset, len]}}}).toArray()[0].a);
+    assert.eq(expected,
+              t.aggregate({$project: {a: {$substrBytes: [str, offset, len]}}}).toArray()[0].a);
 }
 
 function assertArgsException(args) {
-    assert.commandFailed(t.runCommand('aggregate', {pipeline: [{$substr: args}]}));
+    assert.commandFailed(t.runCommand('aggregate', {pipeline: [{$substrBytes: args}]}));
 }
 
 function assertException(str, offset, len) {
@@ -104,17 +105,21 @@ assertSubstring('cde', '$z', {$add: ['$b', '$b']}, {$add: ['$c', '$d']});
 assertSubstring('cde', '$z', {$add: ['$b', 1]}, {$add: [2, '$d']});
 
 // Nested.
-assert.eq('e',
-          t.aggregate({
-              $project: {
-                  a: {
-                      $substr: [
-                          {$substr: [{$substr: [{$substr: ['abcdefghij', 1, 6]}, 2, 5]}, 0, 3]},
-                          1,
-                          1
-                      ]
-                  }
-              }
-          })
-              .toArray()[0]
-              .a);
+assert.eq(
+    'e',
+    t.aggregate({
+        $project: {
+            a: {
+                $substrBytes: [
+                    {
+                      $substrBytes:
+                          [{$substrBytes: [{$substrBytes: ['abcdefghij', 1, 6]}, 2, 5]}, 0, 3]
+                    },
+                    1,
+                    1
+                ]
+            }
+        }
+    })
+        .toArray()[0]
+        .a);
diff --git a/jstests/aggregation/testall.js b/jstests/aggregation/testall.js
index 11f2936abfc..5771cbefaef 100644
--- a/jstests/aggregation/testall.js
+++ b/jstests/aggregation/testall.js
@@ -474,7 +474,7 @@ var p17 = db.runCommand({
     aggregate: "article",
     pipeline: [{
         $project: {
-            author: {$substr: ["$author", 1, 2]},
+            author: {$substrBytes: ["$author", 1, 2]},
         }
     }]
 });
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index c14ce934a43..a7ffd9f86b0 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -41,6 +41,7 @@
 #include "mongo/db/pipeline/value.h"
 #include "mongo/util/string_map.h"
 #include "mongo/util/mongoutils/str.h"
+#include "mongo/platform/bits.h"
 
 namespace mongo {
 using Parser = Expression::Parser;
@@ -2989,9 +2990,48 @@ const char* ExpressionStrcasecmp::getOpName() const {
     return "$strcasecmp";
 }
 
-/* ----------------------- ExpressionSubstr ---------------------------- */
+namespace {
+/**
+ * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
+ * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a
+ * continuation byte.
+ */
+bool isContinuationByte(char charByte) {
+    return (charByte & 0xc0) == 0x80;
+}
+
+/**
+ * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
+ * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading
+ * byte.
+ */
+bool isLeadingByte(char charByte) {
+    return (charByte & 0xc0) == 0xc0;
+}
 
-Value ExpressionSubstr::evaluateInternal(Variables* vars) const {
+/**
+ * UTF-8 single-byte code points are of the form 0xxxxxxx. This method checks whether 'charByte' is
+ * a single-byte code point.
+ */
+bool isSingleByte(char charByte) {
+    return (charByte & 0x80) == 0x0;
+}
+
+size_t getCodePointLength(char charByte) {
+    if (isSingleByte(charByte)) {
+        return 1;
+    }
+
+    invariant(isLeadingByte(charByte));
+
+    // In UTF-8, the number of leading ones is the number of bytes the code point takes up.
+    return countLeadingZeros64(~(uint64_t(charByte) << (64 - 8)));
+}
+}  // namespace
+
+/* ----------------------- ExpressionSubstrBytes ---------------------------- */
+
+Value ExpressionSubstrBytes::evaluateInternal(Variables* vars) const {
     Value pString(vpOperand[0]->evaluateInternal(vars));
     Value pLower(vpOperand[1]->evaluateInternal(vars));
     Value pLength(vpOperand[2]->evaluateInternal(vars));
@@ -3012,8 +3052,6 @@ Value ExpressionSubstr::evaluateInternal(Variables* vars) const {
     string::size_type lower = static_cast<string::size_type>(pLower.coerceToLong());
     string::size_type length = static_cast<string::size_type>(pLength.coerceToLong());
 
-    auto isContinuationByte = [](char c) { return ((c & 0xc0) == 0x80); };
-
     uassert(28656,
             str::stream() << getOpName()
                           << ":  Invalid range, starting index is a UTF-8 continuation byte.",
@@ -3035,9 +3073,86 @@ Value ExpressionSubstr::evaluateInternal(Variables* vars) const {
     return Value(str.substr(lower, length));
 }
 
-REGISTER_EXPRESSION(substr, ExpressionSubstr::parse);
-const char* ExpressionSubstr::getOpName() const {
-    return "$substr";
+// $substr is deprecated in favor of $substrBytes, but for now will just parse into a $substrBytes.
+REGISTER_EXPRESSION(substrBytes, ExpressionSubstrBytes::parse);
+REGISTER_EXPRESSION(substr, ExpressionSubstrBytes::parse);
+const char* ExpressionSubstrBytes::getOpName() const {
+    return "$substrBytes";
+}
+
+/* ----------------------- ExpressionSubstrCP ---------------------------- */
+
+Value ExpressionSubstrCP::evaluateInternal(Variables* vars) const {
+    Value inputVal(vpOperand[0]->evaluateInternal(vars));
+    Value lowerVal(vpOperand[1]->evaluateInternal(vars));
+    Value lengthVal(vpOperand[2]->evaluateInternal(vars));
+
+    std::string str = inputVal.coerceToString();
+    uassert(34450,
+            str::stream() << getOpName() << ": starting index must be a numeric type (is BSON type "
+                          << typeName(lowerVal.getType()) << ")",
+            lowerVal.numeric());
+    uassert(34451,
+            str::stream() << getOpName()
+                          << ": starting index cannot be represented as a 32-bit integral value: "
+                          << lowerVal.toString(),
+            lowerVal.integral());
+    uassert(34452,
+            str::stream() << getOpName() << ": length must be a numeric type (is BSON type "
+                          << typeName(lengthVal.getType()) << ")",
+            lengthVal.numeric());
+    uassert(34453,
+            str::stream() << getOpName()
+                          << ": length cannot be represented as a 32-bit integral value: "
+                          << lengthVal.toString(),
+            lengthVal.integral());
+
+    int startIndexCodePoints = lowerVal.coerceToInt();
+    int length = lengthVal.coerceToInt();
+
+    uassert(34454,
+            str::stream() << getOpName() << ": length must be a nonnegative integer.",
+            length >= 0);
+
+    uassert(34455,
+            str::stream() << getOpName() << ": the starting index must be nonnegative integer.",
+            startIndexCodePoints >= 0);
+
+    size_t startIndexBytes = 0;
+
+    for (int i = 0; i < startIndexCodePoints; i++) {
+        if (startIndexBytes >= str.size()) {
+            return Value("");
+        }
+        uassert(34456,
+                str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+                !isContinuationByte(str[startIndexBytes]));
+        size_t codePointLength = getCodePointLength(str[startIndexBytes]);
+        uassert(34457,
+                str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+                codePointLength <= 4);
+        startIndexBytes += codePointLength;
+    }
+
+    size_t endIndexBytes = startIndexBytes;
+
+    for (int i = 0; i < length && endIndexBytes < str.size(); i++) {
+        uassert(34458,
+                str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+                !isContinuationByte(str[endIndexBytes]));
+        size_t codePointLength = getCodePointLength(str[endIndexBytes]);
+        uassert(34459,
+                str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+                codePointLength <= 4);
+        endIndexBytes += codePointLength;
+    }
+
+    return Value(std::string(str, startIndexBytes, endIndexBytes - startIndexBytes));
+}
+
+REGISTER_EXPRESSION(substrCP, ExpressionSubstrCP::parse);
+const char* ExpressionSubstrCP::getOpName() const {
+    return "$substrCP";
 }
 
 /* ----------------------- ExpressionSubtract ---------------------------- */
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
index cf52497e252..71fcfec44ff 100644
--- a/src/mongo/db/pipeline/expression.h
+++ b/src/mongo/db/pipeline/expression.h
@@ -1245,7 +1245,14 @@ public:
 };
 
 
-class ExpressionSubstr final : public ExpressionFixedArity<ExpressionSubstr, 3> {
+class ExpressionSubstrBytes : public ExpressionFixedArity<ExpressionSubstrBytes, 3> {
+public:
+    Value evaluateInternal(Variables* vars) const final;
+    const char* getOpName() const;
+};
+
+
+class ExpressionSubstrCP final : public ExpressionFixedArity<ExpressionSubstrCP, 3> {
 public:
     Value evaluateInternal(Variables* vars) const final;
     const char* getOpName() const final;
diff --git a/src/mongo/db/pipeline/expression_test.cpp b/src/mongo/db/pipeline/expression_test.cpp
index 29c3e463690..10c70250ffa 100644
--- a/src/mongo/db/pipeline/expression_test.cpp
+++ b/src/mongo/db/pipeline/expression_test.cpp
@@ -4294,7 +4294,7 @@ class NullMiddleGt : public ExpectedResultBase {
 
 }  // namespace Strcasecmp
 
-namespace Substr {
+namespace SubstrBytes {
 
 class ExpectedResultBase {
 public:
@@ -4317,7 +4317,7 @@ protected:
 
 private:
     BSONObj spec() {
-        return BSON("$substr" << BSON_ARRAY(str() << offset() << length()));
+        return BSON("$substrBytes" << BSON_ARRAY(str() << offset() << length()));
     }
 };
 
@@ -4403,6 +4403,60 @@ class DropEndingNull : public ExpectedResultBase {
 
 }  // namespace Substr
 
+namespace SubstrCP {
+
+TEST(ExpressionSubstrCPTest, DoesThrowWithBadContinuationByte) {
+    VariablesIdGenerator idGenerator;
+    VariablesParseState vps(&idGenerator);
+
+    char continuationByte = 0x80;
+    const auto expr = Expression::parseExpression(
+        BSON("$substrCP" << BSON_ARRAY(&continuationByte << 0 << 1)).firstElement(), vps);
+    ASSERT_THROWS({ expr->evaluate(Document()); }, UserException);
+}
+
+TEST(ExpressionSubstrCPTest, DoesThrowWithInvalidLeadingByte) {
+    VariablesIdGenerator idGenerator;
+    VariablesParseState vps(&idGenerator);
+
+    char leadingByte = 0xFF;
+    const auto expr = Expression::parseExpression(
+        BSON("$substrCP" << BSON_ARRAY(&leadingByte << 0 << 1)).firstElement(), vps);
+    ASSERT_THROWS({ expr->evaluate(Document()); }, UserException);
+}
+
+TEST(ExpressionSubstrCPTest, WithStandardValue) {
+    assertExpectedResults("$substrCP", {{{Value("abc"), Value(0), Value(2)}, Value("ab")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithNullCharacter) {
+    assertExpectedResults("$substrCP", {{{Value("abc\0d"), Value(2), Value(3)}, Value("c\0d")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithNullCharacterAtEnd) {
+    assertExpectedResults("$substrCP", {{{Value("abc\0"), Value(2), Value(2)}, Value("c\0")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithOutOfRangeString) {
+    assertExpectedResults("$substrCP", {{{Value("abc"), Value(3), Value(2)}, Value("")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithPartiallyOutOfRangeString) {
+    assertExpectedResults("$substrCP", {{{Value("abc"), Value(1), Value(4)}, Value("bc")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithUnicodeValue) {
+    assertExpectedResults("$substrCP", {{{Value("øø∫å"), Value(0), Value(4)}, Value("øø∫å")}});
+    assertExpectedResults("$substrBytes", {{{Value("øø∫å"), Value(0), Value(4)}, Value("øø")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithMixedUnicodeAndASCIIValue) {
+    assertExpectedResults("$substrCP", {{{Value("a∫bøßabc"), Value(1), Value(4)}, Value("∫bøß")}});
+    assertExpectedResults("$substrBytes", {{{Value("a∫bøßabc"), Value(1), Value(4)}, Value("∫b")}});
+}
+
+}  // namespace SubstrCP
+
 namespace Type {
 
 TEST(ExpressionTypeTest, WithMinKeyValue) {
@@ -4963,11 +5017,11 @@ public:
         add<Strcasecmp::NullMiddleEq>();
         add<Strcasecmp::NullMiddleGt>();
 
-        add<Substr::FullNull>();
-        add<Substr::BeginAtNull>();
-        add<Substr::EndAtNull>();
-        add<Substr::DropBeginningNull>();
-        add<Substr::DropEndingNull>();
+        add<SubstrBytes::FullNull>();
+        add<SubstrBytes::BeginAtNull>();
+        add<SubstrBytes::EndAtNull>();
+        add<SubstrBytes::DropBeginningNull>();
+        add<SubstrBytes::DropEndingNull>();
 
         add<ToLower::NullBegin>();
         add<ToLower::NullMiddle>();
author	Benjamin Murphy <benjamin_murphy@me.com>	2016-03-17 15:21:48 -0400
committer	Benjamin Murphy <benjamin_murphy@me.com>	2016-03-25 10:37:50 -0400
commit	5afa97da4ce5049ef7eb8bf4717ce37bd6777754 (patch)
tree	94f9cf0e1581f914ac26c8907a93030dd07c570d
parent	10b650835a78fbf7ebe9cf26926cc45556efd178 (diff)
download	mongo-5afa97da4ce5049ef7eb8bf4717ce37bd6777754.tar.gz