summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--jstests/aggregation/expressions/indexof_array.js68
-rw-r--r--jstests/aggregation/expressions/indexof_bytes.js149
-rw-r--r--jstests/aggregation/expressions/indexof_codepoints.js129
-rw-r--r--jstests/aggregation/extras/utils.js4
-rw-r--r--src/mongo/db/pipeline/expression.cpp299
-rw-r--r--src/mongo/db/pipeline/expression.h24
6 files changed, 620 insertions, 53 deletions
diff --git a/jstests/aggregation/expressions/indexof_array.js b/jstests/aggregation/expressions/indexof_array.js
new file mode 100644
index 00000000000..bfc9ef71a15
--- /dev/null
+++ b/jstests/aggregation/expressions/indexof_array.js
@@ -0,0 +1,68 @@
+// In SERVER-8951, $indexOfArray was introduced. In this file, we test the correctness and error
+// cases of the expression.
+load("jstests/aggregation/extras/utils.js"); // For assertErrorCode and testExpression.
+
+(function() {
+ "use strict";
+
+ var coll = db.indexofarray;
+ coll.drop();
+
+ // Insert a dummy document to ensure something flows through the pipeline.
+ assert.writeOK(coll.insert({}));
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2]}, 1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 4]}, -1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3, 2, 1], 2, 2]}, 3);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3, 4, 5], 4, 0, 3]}, -1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 1]}, 1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 0, 10]}, 1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3, 2, 1, 2, 3], 2, 2, 4]}, 3);
+
+ testExpression(coll, {$indexOfArray: [null, 2]}, null);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3]}, -1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3, 1]}, -1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3, 3]}, -1);
+
+ testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3, 5]}, -1);
+
+ testExpression(coll, {$indexOfArray: [[], 1]}, -1);
+
+ var pipeline = {
+ $project: {
+ output: {
+ $indexOfArray: ["string", "s"],
+ }
+ }
+ };
+ assertErrorCode(coll, pipeline, 40090);
+
+ pipeline = {
+ $project: {output: {$indexOfArray: [[1, 2, 3], 2, "bad"]}}
+ };
+ assertErrorCode(coll, pipeline, 40096);
+
+ pipeline = {
+ $project: {output: {$indexOfArray: [[1, 2, 3], 2, 0, "bad"]}}
+ };
+ assertErrorCode(coll, pipeline, 40096);
+
+ pipeline = {
+ $project: {output: {$indexOfArray: [[1, 2, 3], 2, -1]}}
+ };
+ assertErrorCode(coll, pipeline, 40097);
+
+ pipeline = {
+ $project: {output: {$indexOfArray: [[1, 2, 3], 2, 1, -1]}}
+ };
+ assertErrorCode(coll, pipeline, 40097);
+}());
diff --git a/jstests/aggregation/expressions/indexof_bytes.js b/jstests/aggregation/expressions/indexof_bytes.js
new file mode 100644
index 00000000000..ac3cefda790
--- /dev/null
+++ b/jstests/aggregation/expressions/indexof_bytes.js
@@ -0,0 +1,149 @@
+// In SERVER-8951, $indexOfBytes was introduced. In this file, we test the correctness and error
+// cases of the expression.
+load("jstests/aggregation/extras/utils.js"); // For assertErrorCode and testExpression.
+
+(function() {
+ "use strict";
+
+ function testExpressionBytes(coll, expression, result, shouldTestEquivalence = true) {
+ testExpression(coll, expression, result);
+
+ if (shouldTestEquivalence) {
+ // If we are specifying a starting or ending index for the search, we should be able to
+ // achieve equivalent behavior using $substrBytes.
+ var indexOfSpec = expression["$indexOfBytes"];
+ var input = indexOfSpec[0];
+ var token = indexOfSpec[1];
+ var start = indexOfSpec.length > 2 ? indexOfSpec[2] : 0;
+ // Use $strLenBytes because JavaScript's length property is based off of UTF-16, not the
+ // actual number of bytes.
+ var end = indexOfSpec.length > 3 ? indexOfSpec[3] : {
+ $strLenBytes: input
+ };
+
+ var substrExpr = {
+ $indexOfBytes: [{$substrBytes: [input, start, {$subtract: [end, start]}]}, token]
+ };
+
+ // Since the new expression takes the index with respect to a shortened string, the
+ // output index will differ from the index with respect to the full length string,
+ // unless the output is -1.
+ var substrResult = (result === -1) ? -1 : result - start;
+
+ testExpression(coll, substrExpr, substrResult);
+ }
+ }
+
+ var coll = db.indexofbytes;
+ coll.drop();
+
+ // Insert a dummy document so something flows through the pipeline.
+ assert.writeOK(coll.insert({}));
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "b"]}, 1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abcba", "b"]}, 1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "d"]}, -1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abcba", "b", 2]}, 3);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abcde", "d", 0, 2]}, -1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 1]}, 1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 0, 10]}, 1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abcbabc", "b", 2, 4]}, 3);
+
+ // $strLenBytes does not accept null as an input.
+ testExpressionBytes(coll, {$indexOfBytes: [null, "b"]}, null, false);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 3]}, -1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 3, 1]}, -1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 3, 5]}, -1);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["", " "]}, -1);
+
+ testExpressionBytes(coll, {$indexOfBytes: [" ", ""]}, 0);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["", ""]}, 0);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "", 3]}, 3);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc", "", 1]}, 1);
+
+ // Test with multi-byte tokens.
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abcde", "de"]}, 3);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abcde", "def"]}, -1);
+
+ // Test with non-ASCII characters. Some tests do not test equivalence using $substrBytes because
+ // $substrBytes disallows taking a substring that begins or ends in the middle of a UTF-8
+ // encoding of a character.
+ testExpressionBytes(coll, {$indexOfBytes: ["a∫∫b", "b"]}, 7);
+
+ // $substrBytes would attempt to take the substring from the middle of a UTF-8
+ // encoding of a character.
+ testExpressionBytes(coll, {$indexOfBytes: ["a∫∫b", "b", 6]}, 7, false);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc∫ba", "∫"]}, 3);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["∫∫∫", "a"]}, -1);
+
+ // $substrBytes would attempt to take the substring from the middle of a UTF-8
+ // encoding of a character.
+ testExpressionBytes(coll, {$indexOfBytes: ["ab∫c", "c", 0, 3]}, -1, false);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc∫b∫", "b∫"]}, 6);
+
+ // Test with embedded null bytes.
+ testExpressionBytes(coll, {$indexOfBytes: ["abc\0d", "d"]}, 4);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc\0", "\0"]}, 3);
+
+ testExpressionBytes(coll, {$indexOfBytes: ["abc\0d\0", "d", 5, 6]}, -1);
+
+ // Error cases.
+
+ var pipeline = {
+ $project: {
+ output: {
+ $indexOfBytes: [3, "s"],
+ }
+ }
+ };
+ assertErrorCode(coll, pipeline, 40091);
+
+ pipeline = {
+ $project: {
+ output: {
+ $indexOfBytes: ["s", 3],
+ }
+ }
+ };
+ assertErrorCode(coll, pipeline, 40092);
+
+ pipeline = {
+ $project: {output: {$indexOfBytes: ["abc", "b", "bad"]}}
+ };
+ assertErrorCode(coll, pipeline, 40096);
+
+ pipeline = {
+ $project: {output: {$indexOfBytes: ["abc", "b", 0, "bad"]}}
+ };
+ assertErrorCode(coll, pipeline, 40096);
+
+ pipeline = {
+ $project: {output: {$indexOfBytes: ["abc", "b", -1]}}
+ };
+ assertErrorCode(coll, pipeline, 40097);
+
+ pipeline = {
+ $project: {output: {$indexOfBytes: ["abc", "b", 1, -1]}}
+ };
+ assertErrorCode(coll, pipeline, 40097);
+}());
diff --git a/jstests/aggregation/expressions/indexof_codepoints.js b/jstests/aggregation/expressions/indexof_codepoints.js
new file mode 100644
index 00000000000..20b9534b050
--- /dev/null
+++ b/jstests/aggregation/expressions/indexof_codepoints.js
@@ -0,0 +1,129 @@
+// In SERVER-8951, $indexOfCP was introduced. In this file, we test the correctness and error
+// cases of the expression.
+load("jstests/aggregation/extras/utils.js"); // For assertErrorCode and testExpression.
+
+(function() {
+ "use strict";
+
+ function testExpressionCodePoints(coll, expression, result, shouldTestEquivalence = true) {
+ testExpression(coll, expression, result);
+
+ var indexOfSpec = expression["$indexOfCP"];
+ if (shouldTestEquivalence) {
+ // If we are specifying a starting or ending index for the search, we should be able to
+ // achieve equivalent behavior using $substrCP.
+ var input = indexOfSpec[0];
+ var token = indexOfSpec[1];
+ var start = indexOfSpec.length > 2 ? indexOfSpec[2] : 0;
+ var end = indexOfSpec.length > 3 ? indexOfSpec[3] : {
+ $strLenCP: input
+ };
+
+ var substrExpr = {
+ $indexOfCP: [{$substrCP: [input, start, {$subtract: [end, start]}]}, token]
+ };
+
+ // Since the new expression takes the index with respect to a shortened string, the
+ // output index will differ from the index with respect to the full length string,
+ // unless the output is -1.
+ var substrResult = (result === -1) ? -1 : result - start;
+
+ testExpression(coll, substrExpr, substrResult);
+ }
+ }
+
+ var coll = db.indexofcp;
+ coll.drop();
+
+ // Insert a dummy document so something flows through the pipeline.
+ assert.writeOK(coll.insert({}));
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["∫aƒ", "ƒ"]}, 2);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["a∫c", "d"]}, -1);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["∫b∫ba", "b", 2]}, 3);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["ab∫de", "d", 0, 3]}, -1);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["ab∫de", "d", 0, 4]}, 3);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["øøc", "ø", 1]}, 1);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["øƒc", "ƒ", 0, 10]}, 1);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["abcbabc", "b", 2, 4]}, 3);
+
+ // $strLenCP does not accept null as an input.
+ testExpressionCodePoints(coll, {$indexOfCP: [null, "√"]}, null, false);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["abc", "b", 3]}, -1);
+
+ // We are intentionally testing specifying an end index before the start index, which is why we
+ // cannot use $substrCP in checking for equivalence.
+ testExpressionCodePoints(coll, {$indexOfCP: ["a√cb", "b", 3, 1]}, -1, false);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["a∫b", "b", 3, 5]}, -1);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["", "∫"]}, -1);
+
+ testExpressionCodePoints(coll, {$indexOfCP: [" ", ""]}, 0);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["", ""]}, 0);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["abc", "", 1]}, 1);
+
+ // Test with multi-byte tokens.
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["abcƒe", "ƒe"]}, 3);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["∫aeøø", "øøø"]}, -1);
+
+ // Test with embedded null bytes.
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["ab∫\0d", "d"]}, 4);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["øbc\0", "\0"]}, 3);
+
+ testExpressionCodePoints(coll, {$indexOfCP: ["πbƒ\0d\0", "d", 5, 6]}, -1);
+
+ // Error cases.
+
+ var pipeline = {
+ $project: {
+ output: {
+ $indexOfCP: [3, "s"],
+ }
+ }
+ };
+ assertErrorCode(coll, pipeline, 40093);
+
+ pipeline = {
+ $project: {
+ output: {
+ $indexOfCP: ["s", 3],
+ }
+ }
+ };
+ assertErrorCode(coll, pipeline, 40094);
+
+ pipeline = {
+ $project: {output: {$indexOfCP: ["abc", "b", "bad"]}}
+ };
+ assertErrorCode(coll, pipeline, 40096);
+
+ pipeline = {
+ $project: {output: {$indexOfCP: ["abc", "b", 0, "bad"]}}
+ };
+ assertErrorCode(coll, pipeline, 40096);
+
+ pipeline = {
+ $project: {output: {$indexOfCP: ["abc", "b", -1]}}
+ };
+ assertErrorCode(coll, pipeline, 40097);
+
+ pipeline = {
+ $project: {output: {$indexOfCP: ["abc", "b", 1, -1]}}
+ };
+ assertErrorCode(coll, pipeline, 40097);
+}());
diff --git a/jstests/aggregation/extras/utils.js b/jstests/aggregation/extras/utils.js
index c1a8fc5645d..d8203f56b94 100644
--- a/jstests/aggregation/extras/utils.js
+++ b/jstests/aggregation/extras/utils.js
@@ -8,8 +8,8 @@ function testExpression(coll, expression, result) {
var res = coll.aggregate({$project: {output: expression}}).toArray();
- assert.eq(res.length, 1);
- assert.eq(res[0].output, result);
+ assert.eq(res.length, 1, tojson(res));
+ assert.eq(res[0].output, result, tojson(res));
}
/*
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index f462052d856..8024ec39cf2 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -359,6 +359,44 @@ intrusive_ptr<Expression> Expression::parseOperand(BSONElement exprElement,
}
}
+namespace {
+/**
+ * Reports whether 'charByte' is a UTF-8 continuation byte. Within a multi-byte code point, every
+ * byte after the first has the bit pattern 10xxxxxx.
+ */
+bool isContinuationByte(char charByte) {
+    const int topTwoBits = charByte & 0xc0;
+    return topTwoBits == 0x80;
+}
+
+/**
+ * Reports whether 'charByte' is the first byte of a multi-byte UTF-8 code point. Such bytes have
+ * the bit pattern 11xxxxxx.
+ */
+bool isLeadingByte(char charByte) {
+    const int topTwoBits = charByte & 0xc0;
+    return topTwoBits == 0xc0;
+}
+
+/**
+ * Reports whether 'charByte' is a complete code point on its own. Single-byte UTF-8 code points
+ * have the bit pattern 0xxxxxxx.
+ */
+bool isSingleByte(char charByte) {
+    return !(charByte & 0x80);
+}
+
+/**
+ * Given the first byte of a code point, returns how many bytes that code point occupies. The byte
+ * must be either a single-byte code point or a leading byte; anything else trips the invariant.
+ */
+size_t getCodePointLength(char charByte) {
+    if (!isSingleByte(charByte)) {
+        invariant(isLeadingByte(charByte));
+
+        // Move the byte into the most significant position and invert it: the run of leading ones
+        // in a UTF-8 leading byte (which equals the code point's byte count) becomes a run of
+        // leading zeros that can be counted directly.
+        const uint64_t byteInTopPosition = uint64_t(charByte) << (64 - 8);
+        return countLeadingZeros64(~byteInTopPosition);
+    }
+
+    return 1;
+}
+} // namespace
/* ----------------------- ExpressionAbs ---------------------------- */
@@ -2118,6 +2156,216 @@ const char* ExpressionIn::getOpName() const {
return "$in";
}
+/* ----------------------- ExpressionIndexOfArray ------------------ */
+
+namespace {
+
+/**
+ * Validates an index argument of one of the $indexOf* expressions. Fails a uassert with code
+ * 40096 if 'val' is not integral, or with code 40097 if it is integral but negative.
+ * 'expressionName' and 'argumentName' are only used to build the error messages.
+ */
+void uassertIfNotIntegralAndNonNegative(Value val,
+                                        StringData expressionName,
+                                        StringData argumentName) {
+    // Note the leading space in " requires": 'expressionName' is streamed immediately before it.
+    uassert(40096,
+            str::stream() << expressionName << " requires an integral " << argumentName
+                          << ", found a value of type: " << typeName(val.getType())
+                          << ", with value: " << val.toString(),
+            val.integral());
+    uassert(40097,
+            str::stream() << expressionName << " requires a nonnegative " << argumentName
+                          << ", found: " << val.toString(),
+            val.coerceToInt() >= 0);
+}
+
+} // namespace
+
+/**
+ * Evaluates $indexOfArray. The operands are, in order: the array to search, the value to search
+ * for, an optional starting index, and an optional ending index.
+ *
+ * Returns null if the first operand is nullish; otherwise the index of the first element in
+ * [start, min(end, array size)) that compares equal to the search value, or -1 if there is no
+ * such element. Fails with code 40090 if the first operand is neither nullish nor an array, and
+ * with 40096/40097 if an index operand is not a nonnegative integral value.
+ */
+Value ExpressionIndexOfArray::evaluateInternal(Variables* vars) const {
+    Value arrayArg = vpOperand[0]->evaluateInternal(vars);
+
+    if (arrayArg.nullish()) {
+        return Value(BSONNULL);
+    }
+
+    uassert(40090,
+            str::stream() << "$indexOfArray requires an array as a first argument, found: "
+                          << typeName(arrayArg.getType()),
+            arrayArg.isArray());
+
+    // Bind by reference instead of copying every element; 'arrayArg' outlives all uses below.
+    const std::vector<Value>& array = arrayArg.getArray();
+
+    Value searchItem = vpOperand[1]->evaluateInternal(vars);
+
+    size_t startIndex = 0;
+    if (vpOperand.size() > 2) {
+        Value startIndexArg = vpOperand[2]->evaluateInternal(vars);
+        uassertIfNotIntegralAndNonNegative(startIndexArg, getOpName(), "starting index");
+        startIndex = static_cast<size_t>(startIndexArg.coerceToInt());
+    }
+
+    size_t endIndex = array.size();
+    if (vpOperand.size() > 3) {
+        Value endIndexArg = vpOperand[3]->evaluateInternal(vars);
+        uassertIfNotIntegralAndNonNegative(endIndexArg, getOpName(), "ending index");
+        // Don't let 'endIndex' exceed the length of the array.
+        endIndex = std::min(array.size(), static_cast<size_t>(endIndexArg.coerceToInt()));
+    }
+
+    // 'endIndex' never exceeds the array size, so a start at or past the end simply skips the
+    // loop and falls through to the "not found" result.
+    for (size_t i = startIndex; i < endIndex; i++) {
+        if (array[i] == searchItem) {
+            return Value(static_cast<int>(i));
+        }
+    }
+
+    return Value(-1);
+}
+
+REGISTER_EXPRESSION(indexOfArray, ExpressionIndexOfArray::parse);
+// Returns the operator name of this expression as it is spelled in an aggregation pipeline.
+const char* ExpressionIndexOfArray::getOpName() const {
+ return "$indexOfArray";
+}
+
+/* ----------------------- ExpressionIndexOfBytes ------------------ */
+
+namespace {
+
+/**
+ * Returns true if 'token' occurs in 'input' starting exactly at byte offset 'index'. Returns
+ * false when fewer than token.size() bytes remain at 'index'.
+ */
+bool stringHasTokenAtIndex(size_t index, const std::string& input, const std::string& token) {
+    // The length check comes first so that compare() is never called with an out-of-range index.
+    const bool tokenFits = token.size() + index <= input.size();
+    return tokenFits && input.compare(index, token.size(), token) == 0;
+}
+
+} // namespace
+
+/**
+ * Evaluates $indexOfBytes. The operands are, in order: the string to search, the token to search
+ * for, an optional starting byte index, and an optional ending byte index.
+ *
+ * Returns null if the first operand is nullish; otherwise the byte index of the first occurrence
+ * of the token that lies entirely within [start, min(end, input size)), or -1 if there is none.
+ * Fails with code 40091/40092 if the first/second operand is not a string, and with 40096/40097
+ * if an index operand is not a nonnegative integral value.
+ */
+Value ExpressionIndexOfBytes::evaluateInternal(Variables* vars) const {
+    Value stringArg = vpOperand[0]->evaluateInternal(vars);
+
+    if (stringArg.nullish()) {
+        return Value(BSONNULL);
+    }
+
+    uassert(40091,
+            str::stream() << "$indexOfBytes requires a string as the first argument, found: "
+                          << typeName(stringArg.getType()),
+            stringArg.getType() == String);
+    const std::string& input = stringArg.getString();
+
+    Value tokenArg = vpOperand[1]->evaluateInternal(vars);
+    uassert(40092,
+            str::stream() << "$indexOfBytes requires a string as the second argument, found: "
+                          << typeName(tokenArg.getType()),
+            tokenArg.getType() == String);
+    const std::string& token = tokenArg.getString();
+
+    size_t startIndex = 0;
+    if (vpOperand.size() > 2) {
+        Value startIndexArg = vpOperand[2]->evaluateInternal(vars);
+        uassertIfNotIntegralAndNonNegative(startIndexArg, getOpName(), "starting index");
+        startIndex = static_cast<size_t>(startIndexArg.coerceToInt());
+    }
+
+    size_t endIndex = input.size();
+    if (vpOperand.size() > 3) {
+        Value endIndexArg = vpOperand[3]->evaluateInternal(vars);
+        uassertIfNotIntegralAndNonNegative(endIndexArg, getOpName(), "ending index");
+        // Don't let 'endIndex' exceed the length of the string.
+        endIndex = std::min(input.size(), static_cast<size_t>(endIndexArg.coerceToInt()));
+    }
+
+    // An empty or inverted range, or a start beyond the end of the input, can never match.
+    if (startIndex > input.size() || endIndex < startIndex) {
+        return Value(-1);
+    }
+
+    // Search the input in place rather than allocating a truncated copy. Any match inside
+    // [startIndex, endIndex) is also a match in the full string, so the first match found here is
+    // the answer if, and only if, it ends at or before 'endIndex'.
+    const size_t position = input.find(token, startIndex);
+    if (position == std::string::npos || position + token.size() > endIndex) {
+        return Value(-1);
+    }
+
+    return Value(static_cast<int>(position));
+}
+
+REGISTER_EXPRESSION(indexOfBytes, ExpressionIndexOfBytes::parse);
+// Returns the operator name of this expression as it is spelled in an aggregation pipeline.
+const char* ExpressionIndexOfBytes::getOpName() const {
+ return "$indexOfBytes";
+}
+
+/* ----------------------- ExpressionIndexOfCP --------------------- */
+
+/**
+ * Evaluates $indexOfCP. The operands are, in order: the string to search, the token to search
+ * for, an optional starting index, and an optional ending index. Both indexes, and the returned
+ * index, count UTF-8 code points rather than bytes.
+ *
+ * Returns null if the first operand is nullish; otherwise the code point index at which the token
+ * is first found, or -1 if it is not found. Fails with code 40093/40094 if the first/second
+ * operand is not a string, with 40095 if a code point of the input starts with a continuation
+ * byte, and with 40096/40097 if an index operand is not a nonnegative integral value.
+ */
+Value ExpressionIndexOfCP::evaluateInternal(Variables* vars) const {
+ Value stringArg = vpOperand[0]->evaluateInternal(vars);
+
+ if (stringArg.nullish()) {
+ return Value(BSONNULL);
+ }
+
+ uassert(40093,
+ str::stream() << "$indexOfCP requires a string as the first argument, found: "
+ << typeName(stringArg.getType()),
+ stringArg.getType() == String);
+ const std::string& input = stringArg.getString();
+
+ Value tokenArg = vpOperand[1]->evaluateInternal(vars);
+ uassert(40094,
+ str::stream() << "$indexOfCP requires a string as the second argument, found: "
+ << typeName(tokenArg.getType()),
+ tokenArg.getType() == String);
+ const std::string& token = tokenArg.getString();
+
+ size_t startCodePointIndex = 0;
+ if (vpOperand.size() > 2) {
+ Value startIndexArg = vpOperand[2]->evaluateInternal(vars);
+ uassertIfNotIntegralAndNonNegative(startIndexArg, getOpName(), "starting index");
+ startCodePointIndex = static_cast<size_t>(startIndexArg.coerceToInt());
+ }
+
+ // Compute the length (in code points) of the input, and convert 'startCodePointIndex' to a byte
+ // index.
+ // If 'startCodePointIndex' is at or past the number of code points, 'startByteIndex' is never
+ // assigned here and stays 0; the search loop further down does not run in that case because
+ // 'endCodePointIndex' never exceeds 'codePointLength'.
+ size_t codePointLength = 0;
+ size_t startByteIndex = 0;
+ for (size_t byteIx = 0; byteIx < input.size(); ++codePointLength) {
+ if (codePointLength == startCodePointIndex) {
+ // We have determined the byte at which our search will start.
+ startByteIndex = byteIx;
+ }
+
+ // Each iteration begins on the first byte of a code point, so a continuation byte here
+ // means the input is not well-formed UTF-8.
+ uassert(
+ 40095, "$indexOfCP found bad UTF-8 in the input", !isContinuationByte(input[byteIx]));
+ byteIx += getCodePointLength(input[byteIx]);
+ }
+
+ size_t endCodePointIndex = codePointLength;
+ if (vpOperand.size() > 3) {
+ Value endIndexArg = vpOperand[3]->evaluateInternal(vars);
+ uassertIfNotIntegralAndNonNegative(endIndexArg, getOpName(), "ending index");
+
+ // Don't let 'endCodePointIndex' exceed the number of code points in the string.
+ endCodePointIndex =
+ std::min(codePointLength, static_cast<size_t>(endIndexArg.coerceToInt()));
+ }
+
+ // NOTE(review): 'startByteIndex' is always 0 when 'input' is empty, so this returns 0 for an
+ // empty input and empty token even when a nonzero starting index was supplied; the
+ // $indexOfBytes implementation returns -1 when the starting index is past the end of the
+ // input. Confirm whether the difference is intended.
+ if (startByteIndex == 0 && input.empty() && token.empty()) {
+ // If we are finding the index of "" in the string "", the below loop will not loop, so we
+ // need a special case for this.
+ return Value(0);
+ }
+
+ // We must keep track of which byte, and which code point, we are examining, being careful not
+ // to overflow either the length of the string or the ending code point.
+
+ // NOTE(review): because the loop condition is strict, an empty token is never matched at the
+ // position one past the last code point (e.g. input "abc", token "", start 3 yields -1 here).
+ // Confirm this is the desired behavior.
+ size_t currentCodePointIndex = startCodePointIndex;
+ for (size_t byteIx = startByteIndex; currentCodePointIndex < endCodePointIndex;
+ ++currentCodePointIndex) {
+ if (stringHasTokenAtIndex(byteIx, input, token)) {
+ return Value(static_cast<int>(currentCodePointIndex));
+ }
+ // Advance to the first byte of the next code point.
+ byteIx += getCodePointLength(input[byteIx]);
+ }
+
+ return Value(-1);
+}
+
+REGISTER_EXPRESSION(indexOfCP, ExpressionIndexOfCP::parse);
+// Returns the operator name of this expression as it is spelled in an aggregation pipeline.
+const char* ExpressionIndexOfCP::getOpName() const {
+ return "$indexOfCP";
+}
+
/* ----------------------- ExpressionLn ---------------------------- */
Value ExpressionLn::evaluateNumericArg(const Value& numericArg) const {
@@ -3071,18 +3319,6 @@ const char* ExpressionSize::getOpName() const {
/* ----------------------- ExpressionSplit --------------------------- */
-namespace {
-
-bool stringHasTokenAtIndex(size_t index, const std::string& input, const std::string& token) {
- if (token.size() + index > input.size()) {
- return false;
- }
-
- return input.compare(index, token.size(), token) == 0;
-}
-
-} // namespace
-
Value ExpressionSplit::evaluateInternal(Variables* vars) const {
Value inputArg = vpOperand[0]->evaluateInternal(vars);
Value separatorArg = vpOperand[1]->evaluateInternal(vars);
@@ -3174,45 +3410,6 @@ const char* ExpressionStrcasecmp::getOpName() const {
return "$strcasecmp";
}
-namespace {
-/**
- * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
- * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a
- * continuation byte.
- */
-bool isContinuationByte(char charByte) {
- return (charByte & 0xc0) == 0x80;
-}
-
-/**
- * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially
- * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading
- * byte.
- */
-bool isLeadingByte(char charByte) {
- return (charByte & 0xc0) == 0xc0;
-}
-
-/**
- * UTF-8 single-byte code points are of the form 0xxxxxxx. This method checks whether 'charByte' is
- * a single-byte code point.
- */
-bool isSingleByte(char charByte) {
- return (charByte & 0x80) == 0x0;
-}
-
-size_t getCodePointLength(char charByte) {
- if (isSingleByte(charByte)) {
- return 1;
- }
-
- invariant(isLeadingByte(charByte));
-
- // In UTF-8, the number of leading ones is the number of bytes the code point takes up.
- return countLeadingZeros64(~(uint64_t(charByte) << (64 - 8)));
-}
-} // namespace
-
/* ----------------------- ExpressionSubstrBytes ---------------------------- */
Value ExpressionSubstrBytes::evaluateInternal(Variables* vars) const {
diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h
index 777fbd4dc8c..87a9536e689 100644
--- a/src/mongo/db/pipeline/expression.h
+++ b/src/mongo/db/pipeline/expression.h
@@ -859,6 +859,30 @@ public:
};
+/**
+ * Implements the $indexOfArray expression: searches an array for a value, optionally restricted
+ * to a range of indexes. Declared with an arity range of 2 to 4 operands.
+ */
+class ExpressionIndexOfArray final : public ExpressionRangedArity<ExpressionIndexOfArray, 2, 4> {
+public:
+ Value evaluateInternal(Variables* vars) const final;
+ const char* getOpName() const final;
+};
+
+
+/**
+ * Implements the $indexOfBytes expression: searches a string for a token, with all indexes
+ * expressed in bytes. Declared with an arity range of 2 to 4 operands.
+ */
+class ExpressionIndexOfBytes final : public ExpressionRangedArity<ExpressionIndexOfBytes, 2, 4> {
+public:
+ Value evaluateInternal(Variables* vars) const final;
+ const char* getOpName() const final;
+};
+
+
+/**
+ * Implements the $indexOfCP expression: indexOf behavior for strings with UTF-8 encoding, with
+ * all indexes expressed in code points rather than bytes. Declared with an arity range of 2 to 4
+ * operands.
+ */
+class ExpressionIndexOfCP final : public ExpressionRangedArity<ExpressionIndexOfCP, 2, 4> {
+public:
+ Value evaluateInternal(Variables* vars) const final;
+ const char* getOpName() const final;
+};
+
+
class ExpressionLet final : public Expression {
public:
boost::intrusive_ptr<Expression> optimize() final;