summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBenjamin Murphy <benjamin_murphy@me.com>2016-03-17 15:21:48 -0400
committerBenjamin Murphy <benjamin_murphy@me.com>2016-03-25 10:37:50 -0400
commit5afa97da4ce5049ef7eb8bf4717ce37bd6777754 (patch)
tree94f9cf0e1581f914ac26c8907a93030dd07c570d
parent10b650835a78fbf7ebe9cf26926cc45556efd178 (diff)
downloadmongo-5afa97da4ce5049ef7eb8bf4717ce37bd6777754.tar.gz
SERVER-22580 Aggregation now supports substrCP.
-rw-r--r--jstests/aggregation/bugs/server22580.js43
-rw-r--r--jstests/aggregation/bugs/server6556.js9
-rw-r--r--jstests/aggregation/bugs/substr.js39
-rw-r--r--jstests/aggregation/testall.js2
-rw-r--r--src/mongo/db/pipeline/expression.cpp129
-rw-r--r--src/mongo/db/pipeline/expression.h9
-rw-r--r--src/mongo/db/pipeline/expression_test.cpp68
7 files changed, 261 insertions, 38 deletions
diff --git a/jstests/aggregation/bugs/server22580.js b/jstests/aggregation/bugs/server22580.js
new file mode 100644
index 00000000000..afbfdd00dcd
--- /dev/null
+++ b/jstests/aggregation/bugs/server22580.js
@@ -0,0 +1,43 @@
+// In SERVER-22580, the $substrCP expression was introduced. In this file, we test the error cases
+// of this expression.
+load("jstests/aggregation/extras/utils.js"); // For assertErrorCode.
+
+(function() {
+ "use strict";
+
+ var coll = db.substrCP;
+ coll.drop();
+
+ // Need an empty document for pipeline.
+ coll.insert({});
+
+ assertErrorCode(coll,
+ [{$project: {substr: {$substrCP: ["abc", 0, "a"]}}}],
+ 34452,
+ "$substrCP" + " does not accept non-numeric types as a length.");
+
+ assertErrorCode(coll,
+ [{$project: {substr: {$substrCP: ["abc", 0, NaN]}}}],
+ 34453,
+ "$substrCP" + " does not accept non-integers as a length.");
+
+ assertErrorCode(coll,
+ [{$project: {substr: {$substrCP: ["abc", "abc", 3]}}}],
+ 34450,
+ "$substrCP does not accept non-numeric types as a starting index.");
+
+ assertErrorCode(coll,
+ [{$project: {substr: {$substrCP: ["abc", 2.2, 3]}}}],
+ 34451,
+ "$substrCP" + " does not accept non-integers as a starting index.");
+
+ assertErrorCode(coll,
+ [{$project: {substr: {$substrCP: ["abc", -1, 3]}}}],
+ 34455,
+ "$substrCP " + "does not accept negative integers as inputs.");
+
+ assertErrorCode(coll,
+ [{$project: {substr: {$substrCP: ["abc", 1, -3]}}}],
+ 34454,
+ "$substrCP " + "does not accept negative integers as inputs.");
+}());
diff --git a/jstests/aggregation/bugs/server6556.js b/jstests/aggregation/bugs/server6556.js
index 721b9fd5a98..636bef6b02c 100644
--- a/jstests/aggregation/bugs/server6556.js
+++ b/jstests/aggregation/bugs/server6556.js
@@ -9,8 +9,9 @@ c.save({foo: "as\0df"});
assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", "$foo"]}}}).toArray(),
[{matches: true}]);
// compare with the substring containing only the up to the null, they should not match
-assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", {$substr: ["$foo", 0, 3]}]}}})
- .toArray(),
+assert.eq(c.aggregate({
+ $project: {_id: 0, matches: {$eq: ["as\0df", {$substrBytes: ["$foo", 0, 3]}]}}
+}).toArray(),
[{matches: false}]);
// partial the other way shouldnt work either
assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as", "$foo"]}}}).toArray(),
@@ -19,6 +20,4 @@ assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as", "$foo"]}}}).toAr
assert.eq(c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0de", "$foo"]}}}).toArray(),
[{matches: false}]);
// should assert on fieldpaths with a null
-assert.throws(function() {
- c.aggregate({$project: {_id: 0, matches: {$eq: ["as\0df", "$f\0oo"]}}});
-});
+assert.throws(c.aggregate, {$project: {_id: 0, matches: {$eq: ["as\0df", "$f\0oo"]}}});
diff --git a/jstests/aggregation/bugs/substr.js b/jstests/aggregation/bugs/substr.js
index 4aee531dd67..9b514eb4679 100644
--- a/jstests/aggregation/bugs/substr.js
+++ b/jstests/aggregation/bugs/substr.js
@@ -1,4 +1,4 @@
-// Aggregation $substr tests.
+// Aggregation $substrBytes tests.
t = db.jstests_aggregation_substr;
t.drop();
@@ -6,11 +6,12 @@ t.drop();
t.save({});
function assertSubstring(expected, str, offset, len) {
- assert.eq(expected, t.aggregate({$project: {a: {$substr: [str, offset, len]}}}).toArray()[0].a);
+ assert.eq(expected,
+ t.aggregate({$project: {a: {$substrBytes: [str, offset, len]}}}).toArray()[0].a);
}
function assertArgsException(args) {
- assert.commandFailed(t.runCommand('aggregate', {pipeline: [{$substr: args}]}));
+ assert.commandFailed(t.runCommand('aggregate', {pipeline: [{$substrBytes: args}]}));
}
function assertException(str, offset, len) {
@@ -104,17 +105,21 @@ assertSubstring('cde', '$z', {$add: ['$b', '$b']}, {$add: ['$c', '$d']});
assertSubstring('cde', '$z', {$add: ['$b', 1]}, {$add: [2, '$d']});
// Nested.
-assert.eq('e',
- t.aggregate({
- $project: {
- a: {
- $substr: [
- {$substr: [{$substr: [{$substr: ['abcdefghij', 1, 6]}, 2, 5]}, 0, 3]},
- 1,
- 1
- ]
- }
- }
- })
- .toArray()[0]
- .a);
+assert.eq(
+ 'e',
+ t.aggregate({
+ $project: {
+ a: {
+ $substrBytes: [
+ {
+ $substrBytes:
+ [{$substrBytes: [{$substrBytes: ['abcdefghij', 1, 6]}, 2, 5]}, 0, 3]
+ },
+ 1,
+ 1
+ ]
+ }
+ }
+ })
+ .toArray()[0]
+ .a);
diff --git a/jstests/aggregation/testall.js b/jstests/aggregation/testall.js
index 11f2936abfc..5771cbefaef 100644
--- a/jstests/aggregation/testall.js
+++ b/jstests/aggregation/testall.js
@@ -474,7 +474,7 @@ var p17 = db.runCommand({
aggregate: "article",
pipeline: [{
$project: {
- author: {$substr: ["$author", 1, 2]},
+ author: {$substrBytes: ["$author", 1, 2]},
}
}]
});
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index c14ce934a43..a7ffd9f86b0 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -41,6 +41,7 @@
#include "mongo/db/pipeline/value.h"
#include "mongo/util/string_map.h"
#include "mongo/util/mongoutils/str.h"
+#include "mongo/platform/bits.h"
namespace mongo {
using Parser = Expression::Parser;
@@ -2989,9 +2990,48 @@ const char* ExpressionStrcasecmp::getOpName() const {
return "$strcasecmp";
}
-/* ----------------------- ExpressionSubstr ---------------------------- */
+namespace {
+/**
+ * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
+ * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a
+ * continuation byte.
+ */
+bool isContinuationByte(char charByte) {
+ return (charByte & 0xc0) == 0x80;
+}
+
+/**
+ * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
+ * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading
+ * byte.
+ */
+bool isLeadingByte(char charByte) {
+ return (charByte & 0xc0) == 0xc0;
+}
-Value ExpressionSubstr::evaluateInternal(Variables* vars) const {
+/**
+ * UTF-8 single-byte code points are of the form 0xxxxxxx. This method checks whether 'charByte' is
+ * a single-byte code point.
+ */
+bool isSingleByte(char charByte) {
+ return (charByte & 0x80) == 0x0;
+}
+
+size_t getCodePointLength(char charByte) {
+ if (isSingleByte(charByte)) {
+ return 1;
+ }
+
+ invariant(isLeadingByte(charByte));
+
+ // In UTF-8, the number of leading ones is the number of bytes the code point takes up.
+ return countLeadingZeros64(~(uint64_t(charByte) << (64 - 8)));
+}
+} // namespace
+
+/* ----------------------- ExpressionSubstrBytes ---------------------------- */
+
+Value ExpressionSubstrBytes::evaluateInternal(Variables* vars) const {
Value pString(vpOperand[0]->evaluateInternal(vars));
Value pLower(vpOperand[1]->evaluateInternal(vars));
Value pLength(vpOperand[2]->evaluateInternal(vars));
@@ -3012,8 +3052,6 @@ Value ExpressionSubstr::evaluateInternal(Variables* vars) const {
string::size_type lower = static_cast<string::size_type>(pLower.coerceToLong());
string::size_type length = static_cast<string::size_type>(pLength.coerceToLong());
- auto isContinuationByte = [](char c) { return ((c & 0xc0) == 0x80); };
-
uassert(28656,
str::stream() << getOpName()
<< ": Invalid range, starting index is a UTF-8 continuation byte.",
@@ -3035,9 +3073,86 @@ Value ExpressionSubstr::evaluateInternal(Variables* vars) const {
return Value(str.substr(lower, length));
}
-REGISTER_EXPRESSION(substr, ExpressionSubstr::parse);
-const char* ExpressionSubstr::getOpName() const {
- return "$substr";
+// $substr is deprecated in favor of $substrBytes, but for now will just parse into a $substrBytes.
+REGISTER_EXPRESSION(substrBytes, ExpressionSubstrBytes::parse);
+REGISTER_EXPRESSION(substr, ExpressionSubstrBytes::parse);
+const char* ExpressionSubstrBytes::getOpName() const {
+ return "$substrBytes";
+}
+
+/* ----------------------- ExpressionSubstrCP ---------------------------- */
+
+Value ExpressionSubstrCP::evaluateInternal(Variables* vars) const {
+ Value inputVal(vpOperand[0]->evaluateInternal(vars));
+ Value lowerVal(vpOperand[1]->evaluateInternal(vars));
+ Value lengthVal(vpOperand[2]->evaluateInternal(vars));
+
+ std::string str = inputVal.coerceToString();
+ uassert(34450,
+ str::stream() << getOpName() << ": starting index must be a numeric type (is BSON type "
+ << typeName(lowerVal.getType()) << ")",
+ lowerVal.numeric());
+ uassert(34451,
+ str::stream() << getOpName()
+ << ": starting index cannot be represented as a 32-bit integral value: "
+ << lowerVal.toString(),
+ lowerVal.integral());
+ uassert(34452,
+ str::stream() << getOpName() << ": length must be a numeric type (is BSON type "
+ << typeName(lengthVal.getType()) << ")",
+ lengthVal.numeric());
+ uassert(34453,
+ str::stream() << getOpName()
+ << ": length cannot be represented as a 32-bit integral value: "
+ << lengthVal.toString(),
+ lengthVal.integral());
+
+ int startIndexCodePoints = lowerVal.coerceToInt();
+ int length = lengthVal.coerceToInt();
+
+ uassert(34454,
+ str::stream() << getOpName() << ": length must be a nonnegative integer.",
+ length >= 0);
+
+ uassert(34455,
+ str::stream() << getOpName() << ": the starting index must be nonnegative integer.",
+ startIndexCodePoints >= 0);
+
+ size_t startIndexBytes = 0;
+
+ for (int i = 0; i < startIndexCodePoints; i++) {
+ if (startIndexBytes >= str.size()) {
+ return Value("");
+ }
+ uassert(34456,
+ str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+ !isContinuationByte(str[startIndexBytes]));
+ size_t codePointLength = getCodePointLength(str[startIndexBytes]);
+ uassert(34457,
+ str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+ codePointLength <= 4);
+ startIndexBytes += codePointLength;
+ }
+
+ size_t endIndexBytes = startIndexBytes;
+
+ for (int i = 0; i < length && endIndexBytes < str.size(); i++) {
+ uassert(34458,
+ str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+ !isContinuationByte(str[endIndexBytes]));
+ size_t codePointLength = getCodePointLength(str[endIndexBytes]);
+ uassert(34459,
+ str::stream() << getOpName() << ": invalid UTF-8 string: " << str,
+ codePointLength <= 4);
+ endIndexBytes += codePointLength;
+ }
+
+ return Value(std::string(str, startIndexBytes, endIndexBytes - startIndexBytes));
+}
+
+REGISTER_EXPRESSION(substrCP, ExpressionSubstrCP::parse);
+const char* ExpressionSubstrCP::getOpName() const {
+ return "$substrCP";
}
/* ----------------------- ExpressionSubtract ---------------------------- */
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
index cf52497e252..71fcfec44ff 100644
--- a/src/mongo/db/pipeline/expression.h
+++ b/src/mongo/db/pipeline/expression.h
@@ -1245,7 +1245,14 @@ public:
};
-class ExpressionSubstr final : public ExpressionFixedArity<ExpressionSubstr, 3> {
+class ExpressionSubstrBytes : public ExpressionFixedArity<ExpressionSubstrBytes, 3> {
+public:
+ Value evaluateInternal(Variables* vars) const final;
+ const char* getOpName() const;
+};
+
+
+class ExpressionSubstrCP final : public ExpressionFixedArity<ExpressionSubstrCP, 3> {
public:
Value evaluateInternal(Variables* vars) const final;
const char* getOpName() const final;
diff --git a/src/mongo/db/pipeline/expression_test.cpp b/src/mongo/db/pipeline/expression_test.cpp
index 29c3e463690..10c70250ffa 100644
--- a/src/mongo/db/pipeline/expression_test.cpp
+++ b/src/mongo/db/pipeline/expression_test.cpp
@@ -4294,7 +4294,7 @@ class NullMiddleGt : public ExpectedResultBase {
} // namespace Strcasecmp
-namespace Substr {
+namespace SubstrBytes {
class ExpectedResultBase {
public:
@@ -4317,7 +4317,7 @@ protected:
private:
BSONObj spec() {
- return BSON("$substr" << BSON_ARRAY(str() << offset() << length()));
+ return BSON("$substrBytes" << BSON_ARRAY(str() << offset() << length()));
}
};
@@ -4403,6 +4403,60 @@ class DropEndingNull : public ExpectedResultBase {
} // namespace Substr
+namespace SubstrCP {
+
+TEST(ExpressionSubstrCPTest, DoesThrowWithBadContinuationByte) {
+ VariablesIdGenerator idGenerator;
+ VariablesParseState vps(&idGenerator);
+
+ char continuationByte = 0x80;
+ const auto expr = Expression::parseExpression(
+ BSON("$substrCP" << BSON_ARRAY(&continuationByte << 0 << 1)).firstElement(), vps);
+ ASSERT_THROWS({ expr->evaluate(Document()); }, UserException);
+}
+
+TEST(ExpressionSubstrCPTest, DoesThrowWithInvalidLeadingByte) {
+ VariablesIdGenerator idGenerator;
+ VariablesParseState vps(&idGenerator);
+
+ char leadingByte = 0xFF;
+ const auto expr = Expression::parseExpression(
+ BSON("$substrCP" << BSON_ARRAY(&leadingByte << 0 << 1)).firstElement(), vps);
+ ASSERT_THROWS({ expr->evaluate(Document()); }, UserException);
+}
+
+TEST(ExpressionSubstrCPTest, WithStandardValue) {
+ assertExpectedResults("$substrCP", {{{Value("abc"), Value(0), Value(2)}, Value("ab")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithNullCharacter) {
+ assertExpectedResults("$substrCP", {{{Value("abc\0d"), Value(2), Value(3)}, Value("c\0d")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithNullCharacterAtEnd) {
+ assertExpectedResults("$substrCP", {{{Value("abc\0"), Value(2), Value(2)}, Value("c\0")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithOutOfRangeString) {
+ assertExpectedResults("$substrCP", {{{Value("abc"), Value(3), Value(2)}, Value("")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithPartiallyOutOfRangeString) {
+ assertExpectedResults("$substrCP", {{{Value("abc"), Value(1), Value(4)}, Value("bc")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithUnicodeValue) {
+ assertExpectedResults("$substrCP", {{{Value("øø∫å"), Value(0), Value(4)}, Value("øø∫å")}});
+ assertExpectedResults("$substrBytes", {{{Value("øø∫å"), Value(0), Value(4)}, Value("øø")}});
+}
+
+TEST(ExpressionSubstrCPTest, WithMixedUnicodeAndASCIIValue) {
+ assertExpectedResults("$substrCP", {{{Value("a∫bøßabc"), Value(1), Value(4)}, Value("∫bøß")}});
+ assertExpectedResults("$substrBytes", {{{Value("a∫bøßabc"), Value(1), Value(4)}, Value("∫b")}});
+}
+
+} // namespace SubstrCP
+
namespace Type {
TEST(ExpressionTypeTest, WithMinKeyValue) {
@@ -4963,11 +5017,11 @@ public:
add<Strcasecmp::NullMiddleEq>();
add<Strcasecmp::NullMiddleGt>();
- add<Substr::FullNull>();
- add<Substr::BeginAtNull>();
- add<Substr::EndAtNull>();
- add<Substr::DropBeginningNull>();
- add<Substr::DropEndingNull>();
+ add<SubstrBytes::FullNull>();
+ add<SubstrBytes::BeginAtNull>();
+ add<SubstrBytes::EndAtNull>();
+ add<SubstrBytes::DropBeginningNull>();
+ add<SubstrBytes::DropEndingNull>();
add<ToLower::NullBegin>();
add<ToLower::NullMiddle>();