diff options
author | Benjamin Murphy <benjamin_murphy@me.com> | 2016-03-17 15:21:48 -0400 |
---|---|---|
committer | Benjamin Murphy <benjamin_murphy@me.com> | 2016-03-25 10:37:50 -0400 |
commit | 5afa97da4ce5049ef7eb8bf4717ce37bd6777754 (patch) | |
tree | 94f9cf0e1581f914ac26c8907a93030dd07c570d | |
parent | 10b650835a78fbf7ebe9cf26926cc45556efd178 (diff) | |
download | mongo-5afa97da4ce5049ef7eb8bf4717ce37bd6777754.tar.gz |
SERVER-22580 Aggregation now supports substrCP.
-rw-r--r-- | jstests/aggregation/bugs/server22580.js | 43 | ||||
-rw-r--r-- | jstests/aggregation/bugs/server6556.js | 9 | ||||
-rw-r--r-- | jstests/aggregation/bugs/substr.js | 39 | ||||
-rw-r--r-- | jstests/aggregation/testall.js | 2 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 129 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.h | 9 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression_test.cpp | 68 |
7 files changed, 261 insertions, 38 deletions
diff --git a/jstests/aggregation/bugs/server22580.js b/jstests/aggregation/bugs/server22580.js new file mode 100644 index 00000000000..afbfdd00dcd --- /dev/null +++ b/jstests/aggregation/bugs/server22580.js @@ -0,0 +1,43 @@ +// In SERVER-22580, the $substrCP expression was introduced. In this file, we test the error cases +// of this expression. +load("jstests/aggregation/extras/utils.js"); // For assertErrorCode. + +(function() { + "use strict"; + + var coll = db.substrCP; + coll.drop(); + + // Need an empty document for pipeline. + coll.insert({}); + + assertErrorCode(coll, + [{$project: {substr: {$substrCP: ["abc", 0, "a"]}}}], + 34452, + "$substrCP" + " does not accept non-numeric types as a length."); + + assertErrorCode(coll, + [{$project: {substr: {$substrCP: ["abc", 0, NaN]}}}], + 34453, + "$substrCP" + " does not accept non-integers as a length."); + + assertErrorCode(coll, + [{$project: {substr: {$substrCP: ["abc", "abc", 3]}}}], + 34450, + "$substrCP does not accept non-numeric types as a starting index."); + + assertErrorCode(coll, + [{$project: {substr: {$substrCP: ["abc", 2.2, 3]}}}], + 34451, + "$substrCP" + " does not accept non-integers as a starting index."); + + assertErrorCode(coll, + [{$project: {substr: {$substrCP: ["abc", -1, 3]}}}], + 34455, + "$substrCP " + "does not accept negative integers as inputs."); + + assertErrorCode(coll, + [{$project: {substr: {$substrCP: ["abc", 1, -3]}}}], + 34454, + "$substrCP " + "does not accept negative integers as inputs."); +}()); diff --git a/jstests/aggregation/bugs/server6556.js b/jstests/aggregation/bugs/server6556.js index 721b9fd5a98..636bef6b02c 100644 --- a/jstests/aggregation/bugs/server6556.js +++ b/jstests/aggregation/bugs/server6556.js @@ -9,8 +9,9 @@ c.save({foo: "as\0df"}); assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", "$foo"]}}}).toArray(), [{matches: true}]); // compare with the substring containing only the up to the null, they should not match -assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", {$substr: ["$foo", 0, 3]}]}}}) - .toArray(), +assert.eq(c.aggregate({ + $project: {_id: 0, matches: {$eq: ["as\0df", {$substrBytes: ["$foo", 0, 3]}]}} +}).toArray(), [{matches: false}]); // partial the other way shouldnt work either assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as", "$foo"]}}}).toArray(), @@ -19,6 +20,4 @@ assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as", "$foo"]}}}).toAr assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0de", "$foo"]}}}).toArray(), [{matches: false}]); // should assert on fieldpaths with a null -assert.throws(function() { - c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", "$f\0oo"]}}}); -}); +assert.throws(c.aggregate, {$project: {_id: 0, matches: {$eq: ["as\0df", "$f\0oo"]}}}); diff --git a/jstests/aggregation/bugs/substr.js b/jstests/aggregation/bugs/substr.js index 4aee531dd67..9b514eb4679 100644 --- a/jstests/aggregation/bugs/substr.js +++ b/jstests/aggregation/bugs/substr.js @@ -1,4 +1,4 @@ -// Aggregation $substr tests. +// Aggregation $substrBytes tests. t = db.jstests_aggregation_substr; t.drop(); @@ -6,11 +6,12 @@ t.drop(); t.save({}); function assertSubstring(expected, str, offset, len) { - assert.eq(expected, t.aggregate({$project: {a: {$substr: [str, offset, len]}}}).toArray()[0].a); + assert.eq(expected, + t.aggregate({$project: {a: {$substrBytes: [str, offset, len]}}}).toArray()[0].a); } function assertArgsException(args) { - assert.commandFailed(t.runCommand('aggregate', {pipeline: [{$substr: args}]})); + assert.commandFailed(t.runCommand('aggregate', {pipeline: [{$substrBytes: args}]})); } function assertException(str, offset, len) { @@ -104,17 +105,21 @@ assertSubstring('cde', '$z', {$add: ['$b', '$b']}, {$add: ['$c', '$d']}); assertSubstring('cde', '$z', {$add: ['$b', 1]}, {$add: [2, '$d']}); // Nested. -assert.eq('e', - t.aggregate({ - $project: { - a: { - $substr: [ - {$substr: [{$substr: [{$substr: ['abcdefghij', 1, 6]}, 2, 5]}, 0, 3]}, - 1, - 1 - ] - } - } - }) - .toArray()[0] - .a); +assert.eq( + 'e', + t.aggregate({ + $project: { + a: { + $substrBytes: [ + { + $substrBytes: + [{$substrBytes: [{$substrBytes: ['abcdefghij', 1, 6]}, 2, 5]}, 0, 3] + }, + 1, + 1 + ] + } + } + }) + .toArray()[0] + .a); diff --git a/jstests/aggregation/testall.js b/jstests/aggregation/testall.js index 11f2936abfc..5771cbefaef 100644 --- a/jstests/aggregation/testall.js +++ b/jstests/aggregation/testall.js @@ -474,7 +474,7 @@ var p17 = db.runCommand({ aggregate: "article", pipeline: [{ $project: { - author: {$substr: ["$author", 1, 2]}, + author: {$substrBytes: ["$author", 1, 2]}, } }] }); diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index c14ce934a43..a7ffd9f86b0 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -41,6 +41,7 @@ #include "mongo/db/pipeline/value.h" #include "mongo/util/string_map.h" #include "mongo/util/mongoutils/str.h" +#include "mongo/platform/bits.h" namespace mongo { using Parser = Expression::Parser; @@ -2989,9 +2990,48 @@ const char* ExpressionStrcasecmp::getOpName() const { return "$strcasecmp"; } -/* ----------------------- ExpressionSubstr ---------------------------- */ +namespace { +/** + * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially + * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a + * continuation byte. + */ +bool isContinuationByte(char charByte) { + return (charByte & 0xc0) == 0x80; +} + +/** + * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially + * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading + * byte. + */ +bool isLeadingByte(char charByte) { + return (charByte & 0xc0) == 0xc0; +} -Value ExpressionSubstr::evaluateInternal(Variables* vars) const { +/** + * UTF-8 single-byte code points are of the form 0xxxxxxx. This method checks whether 'charByte' is + * a single-byte code point. + */ +bool isSingleByte(char charByte) { + return (charByte & 0x80) == 0x0; +} + +size_t getCodePointLength(char charByte) { + if (isSingleByte(charByte)) { + return 1; + } + + invariant(isLeadingByte(charByte)); + + // In UTF-8, the number of leading ones is the number of bytes the code point takes up. + return countLeadingZeros64(~(uint64_t(charByte) << (64 - 8))); +} +} // namespace + +/* ----------------------- ExpressionSubstrBytes ---------------------------- */ + +Value ExpressionSubstrBytes::evaluateInternal(Variables* vars) const { Value pString(vpOperand[0]->evaluateInternal(vars)); Value pLower(vpOperand[1]->evaluateInternal(vars)); Value pLength(vpOperand[2]->evaluateInternal(vars)); @@ -3012,8 +3052,6 @@ Value ExpressionSubstr::evaluateInternal(Variables* vars) const { string::size_type lower = static_cast<string::size_type>(pLower.coerceToLong()); string::size_type length = static_cast<string::size_type>(pLength.coerceToLong()); - auto isContinuationByte = [](char c) { return ((c & 0xc0) == 0x80); }; - uassert(28656, str::stream() << getOpName() << ": Invalid range, starting index is a UTF-8 continuation byte.", @@ -3035,9 +3073,86 @@ Value ExpressionSubstr::evaluateInternal(Variables* vars) const { return Value(str.substr(lower, length)); } -REGISTER_EXPRESSION(substr, ExpressionSubstr::parse); -const char* ExpressionSubstr::getOpName() const { - return "$substr"; +// $substr is deprecated in favor of $substrBytes, but for now will just parse into a $substrBytes. +REGISTER_EXPRESSION(substrBytes, ExpressionSubstrBytes::parse); +REGISTER_EXPRESSION(substr, ExpressionSubstrBytes::parse); +const char* ExpressionSubstrBytes::getOpName() const { + return "$substrBytes"; +} + +/* ----------------------- ExpressionSubstrCP ---------------------------- */ + +Value ExpressionSubstrCP::evaluateInternal(Variables* vars) const { + Value inputVal(vpOperand[0]->evaluateInternal(vars)); + Value lowerVal(vpOperand[1]->evaluateInternal(vars)); + Value lengthVal(vpOperand[2]->evaluateInternal(vars)); + + std::string str = inputVal.coerceToString(); + uassert(34450, + str::stream() << getOpName() << ": starting index must be a numeric type (is BSON type " + << typeName(lowerVal.getType()) << ")", + lowerVal.numeric()); + uassert(34451, + str::stream() << getOpName() + << ": starting index cannot be represented as a 32-bit integral value: " + << lowerVal.toString(), + lowerVal.integral()); + uassert(34452, + str::stream() << getOpName() << ": length must be a numeric type (is BSON type " + << typeName(lengthVal.getType()) << ")", + lengthVal.numeric()); + uassert(34453, + str::stream() << getOpName() + << ": length cannot be represented as a 32-bit integral value: " + << lengthVal.toString(), + lengthVal.integral()); + + int startIndexCodePoints = lowerVal.coerceToInt(); + int length = lengthVal.coerceToInt(); + + uassert(34454, + str::stream() << getOpName() << ": length must be a nonnegative integer.", + length >= 0); + + uassert(34455, + str::stream() << getOpName() << ": the starting index must be nonnegative integer.", + startIndexCodePoints >= 0); + + size_t startIndexBytes = 0; + + for (int i = 0; i < startIndexCodePoints; i++) { + if (startIndexBytes >= str.size()) { + return Value(""); + } + uassert(34456, + str::stream() << getOpName() << ": invalid UTF-8 string: " << str, + !isContinuationByte(str[startIndexBytes])); + size_t codePointLength = getCodePointLength(str[startIndexBytes]); + uassert(34457, + str::stream() << getOpName() << ": invalid UTF-8 string: " << str, + codePointLength <= 4); + startIndexBytes += codePointLength; + } + + size_t endIndexBytes = startIndexBytes; + + for (int i = 0; i < length && endIndexBytes < str.size(); i++) { + uassert(34458, + str::stream() << getOpName() << ": invalid UTF-8 string: " << str, + !isContinuationByte(str[endIndexBytes])); + size_t codePointLength = getCodePointLength(str[endIndexBytes]); + uassert(34459, + str::stream() << getOpName() << ": invalid UTF-8 string: " << str, + codePointLength <= 4); + endIndexBytes += codePointLength; + } + + return Value(std::string(str, startIndexBytes, endIndexBytes - startIndexBytes)); +} + +REGISTER_EXPRESSION(substrCP, ExpressionSubstrCP::parse); +const char* ExpressionSubstrCP::getOpName() const { + return "$substrCP"; } /* ----------------------- ExpressionSubtract ---------------------------- */ diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h index cf52497e252..71fcfec44ff 100644 --- a/src/mongo/db/pipeline/expression.h +++ b/src/mongo/db/pipeline/expression.h @@ -1245,7 +1245,14 @@ public: }; -class ExpressionSubstr final : public ExpressionFixedArity<ExpressionSubstr, 3> { +class ExpressionSubstrBytes : public ExpressionFixedArity<ExpressionSubstrBytes, 3> { +public: + Value evaluateInternal(Variables* vars) const final; + const char* getOpName() const; +}; + + +class ExpressionSubstrCP final : public ExpressionFixedArity<ExpressionSubstrCP, 3> { public: Value evaluateInternal(Variables* vars) const final; const char* getOpName() const final; diff --git a/src/mongo/db/pipeline/expression_test.cpp b/src/mongo/db/pipeline/expression_test.cpp index 29c3e463690..10c70250ffa 100644 --- a/src/mongo/db/pipeline/expression_test.cpp +++ b/src/mongo/db/pipeline/expression_test.cpp @@ -4294,7 +4294,7 @@ class NullMiddleGt : public ExpectedResultBase { } // namespace Strcasecmp -namespace Substr { +namespace SubstrBytes { class ExpectedResultBase { public: @@ -4317,7 +4317,7 @@ protected: private: BSONObj spec() { - return BSON("$substr" << BSON_ARRAY(str() << offset() << length())); + return BSON("$substrBytes" << BSON_ARRAY(str() << offset() << length())); } }; @@ -4403,6 +4403,60 @@ class DropEndingNull : public ExpectedResultBase { } // namespace Substr +namespace SubstrCP { + +TEST(ExpressionSubstrCPTest, DoesThrowWithBadContinuationByte) { + VariablesIdGenerator idGenerator; + VariablesParseState vps(&idGenerator); + + char continuationByte = 0x80; + const auto expr = Expression::parseExpression( + BSON("$substrCP" << BSON_ARRAY(&continuationByte << 0 << 1)).firstElement(), vps); + ASSERT_THROWS({ expr->evaluate(Document()); }, UserException); +} + +TEST(ExpressionSubstrCPTest, DoesThrowWithInvalidLeadingByte) { + VariablesIdGenerator idGenerator; + VariablesParseState vps(&idGenerator); + + char leadingByte = 0xFF; + const auto expr = Expression::parseExpression( + BSON("$substrCP" << BSON_ARRAY(&leadingByte << 0 << 1)).firstElement(), vps); + ASSERT_THROWS({ expr->evaluate(Document()); }, UserException); +} + +TEST(ExpressionSubstrCPTest, WithStandardValue) { + assertExpectedResults("$substrCP", {{{Value("abc"), Value(0), Value(2)}, Value("ab")}}); +} + +TEST(ExpressionSubstrCPTest, WithNullCharacter) { + assertExpectedResults("$substrCP", {{{Value("abc\0d"), Value(2), Value(3)}, Value("c\0d")}}); +} + +TEST(ExpressionSubstrCPTest, WithNullCharacterAtEnd) { + assertExpectedResults("$substrCP", {{{Value("abc\0"), Value(2), Value(2)}, Value("c\0")}}); +} + +TEST(ExpressionSubstrCPTest, WithOutOfRangeString) { + assertExpectedResults("$substrCP", {{{Value("abc"), Value(3), Value(2)}, Value("")}}); +} + +TEST(ExpressionSubstrCPTest, WithPartiallyOutOfRangeString) { + assertExpectedResults("$substrCP", {{{Value("abc"), Value(1), Value(4)}, Value("bc")}}); +} + +TEST(ExpressionSubstrCPTest, WithUnicodeValue) { + assertExpectedResults("$substrCP", {{{Value("øø∫å"), Value(0), Value(4)}, Value("øø∫å")}}); + assertExpectedResults("$substrBytes", {{{Value("øø∫å"), Value(0), Value(4)}, Value("øø")}}); +} + +TEST(ExpressionSubstrCPTest, WithMixedUnicodeAndASCIIValue) { + assertExpectedResults("$substrCP", {{{Value("a∫bøßabc"), Value(1), Value(4)}, Value("∫bøß")}}); + assertExpectedResults("$substrBytes", {{{Value("a∫bøßabc"), Value(1), Value(4)}, Value("∫b")}}); +} + +} // namespace SubstrCP + namespace Type { TEST(ExpressionTypeTest, WithMinKeyValue) { @@ -4963,11 +5017,11 @@ public: add<Strcasecmp::NullMiddleEq>(); add<Strcasecmp::NullMiddleGt>(); - add<Substr::FullNull>(); - add<Substr::BeginAtNull>(); - add<Substr::EndAtNull>(); - add<Substr::DropBeginningNull>(); - add<Substr::DropEndingNull>(); + add<SubstrBytes::FullNull>(); + add<SubstrBytes::BeginAtNull>(); + add<SubstrBytes::EndAtNull>(); + add<SubstrBytes::DropBeginningNull>(); + add<SubstrBytes::DropEndingNull>(); add<ToLower::NullBegin>(); add<ToLower::NullMiddle>(); |