diff options
-rw-r--r-- | jstests/aggregation/expressions/indexof_array.js | 68 | ||||
-rw-r--r-- | jstests/aggregation/expressions/indexof_bytes.js | 149 | ||||
-rw-r--r-- | jstests/aggregation/expressions/indexof_codepoints.js | 129 | ||||
-rw-r--r-- | jstests/aggregation/extras/utils.js | 4 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 299 | ||||
-rw-r--r-- | src/mongo/db/pipeline/expression.h | 24 |
6 files changed, 620 insertions, 53 deletions
diff --git a/jstests/aggregation/expressions/indexof_array.js b/jstests/aggregation/expressions/indexof_array.js new file mode 100644 index 00000000000..bfc9ef71a15 --- /dev/null +++ b/jstests/aggregation/expressions/indexof_array.js @@ -0,0 +1,68 @@ +// In SERVER-8951, $indexOfArray was introduced. In this file, we test the correctness and error +// cases of the expression. +load("jstests/aggregation/extras/utils.js"); // For assertErrorCode and testExpression. + +(function() { + "use strict"; + + var coll = db.indexofarray; + coll.drop(); + + // Insert a dummy document to ensure something flows through the pipeline. + assert.writeOK(coll.insert({})); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2]}, 1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 4]}, -1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3, 2, 1], 2, 2]}, 3); + + testExpression(coll, {$indexOfArray: [[1, 2, 3, 4, 5], 4, 0, 3]}, -1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 1]}, 1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 0, 10]}, 1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3, 2, 1, 2, 3], 2, 2, 4]}, 3); + + testExpression(coll, {$indexOfArray: [null, 2]}, null); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3]}, -1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3, 1]}, -1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3, 3]}, -1); + + testExpression(coll, {$indexOfArray: [[1, 2, 3], 2, 3, 5]}, -1); + + testExpression(coll, {$indexOfArray: [[], 1]}, -1); + + var pipeline = { + $project: { + output: { + $indexOfArray: ["string", "s"], + } + } + }; + assertErrorCode(coll, pipeline, 40090); + + pipeline = { + $project: {output: {$indexOfArray: [[1, 2, 3], 2, "bad"]}} + }; + assertErrorCode(coll, pipeline, 40096); + + pipeline = { + $project: {output: {$indexOfArray: [[1, 2, 3], 2, 0, "bad"]}} + }; + assertErrorCode(coll, pipeline, 40096); + + pipeline = { + $project: {output: {$indexOfArray: [[1, 2, 3], 2, -1]}} 
+ }; + assertErrorCode(coll, pipeline, 40097); + + pipeline = { + $project: {output: {$indexOfArray: [[1, 2, 3], 2, 1, -1]}} + }; + assertErrorCode(coll, pipeline, 40097); +}()); diff --git a/jstests/aggregation/expressions/indexof_bytes.js b/jstests/aggregation/expressions/indexof_bytes.js new file mode 100644 index 00000000000..ac3cefda790 --- /dev/null +++ b/jstests/aggregation/expressions/indexof_bytes.js @@ -0,0 +1,149 @@ +// In SERVER-8951, $indexOfBytes was introduced. In this file, we test the correctness and error +// cases of the expression. +load("jstests/aggregation/extras/utils.js"); // For assertErrorCode and testExpression. + +(function() { + "use strict"; + + function testExpressionBytes(coll, expression, result, shouldTestEquivalence = true) { + testExpression(coll, expression, result); + + if (shouldTestEquivalence) { + // If we are specifying a starting or ending index for the search, we should be able to + // achieve equivalent behavior using $substrBytes. + var indexOfSpec = expression["$indexOfBytes"]; + var input = indexOfSpec[0]; + var token = indexOfSpec[1]; + var start = indexOfSpec.length > 2 ? indexOfSpec[2] : 0; + // Use $strLenBytes because JavaScript's length property is based off of UTF-16, not the + // actual number of bytes. + var end = indexOfSpec.length > 3 ? indexOfSpec[3] : { + $strLenBytes: input + }; + + var substrExpr = { + $indexOfBytes: [{$substrBytes: [input, start, {$subtract: [end, start]}]}, token] + }; + + // Since the new expression takes the index with respect to a shortened string, the + // output index will differ from the index with respect to the full length string, + // unless the output is -1. + var substrResult = (result === -1) ? -1 : result - start; + + testExpression(coll, substrExpr, substrResult); + } + } + + var coll = db.indexofbytes; + coll.drop(); + + // Insert a dummy document so something flows through the pipeline. 
+ assert.writeOK(coll.insert({})); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "b"]}, 1); + + testExpressionBytes(coll, {$indexOfBytes: ["abcba", "b"]}, 1); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "d"]}, -1); + + testExpressionBytes(coll, {$indexOfBytes: ["abcba", "b", 2]}, 3); + + testExpressionBytes(coll, {$indexOfBytes: ["abcde", "d", 0, 2]}, -1); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 1]}, 1); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 0, 10]}, 1); + + testExpressionBytes(coll, {$indexOfBytes: ["abcbabc", "b", 2, 4]}, 3); + + // $strLenBytes does not accept null as an input. + testExpressionBytes(coll, {$indexOfBytes: [null, "b"]}, null, false); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 3]}, -1); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 3, 1]}, -1); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "b", 3, 5]}, -1); + + testExpressionBytes(coll, {$indexOfBytes: ["", " "]}, -1); + + testExpressionBytes(coll, {$indexOfBytes: [" ", ""]}, 0); + + testExpressionBytes(coll, {$indexOfBytes: ["", ""]}, 0); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "", 3]}, 3); + + testExpressionBytes(coll, {$indexOfBytes: ["abc", "", 1]}, 1); + + // Test with multi-byte tokens. + + testExpressionBytes(coll, {$indexOfBytes: ["abcde", "de"]}, 3); + + testExpressionBytes(coll, {$indexOfBytes: ["abcde", "def"]}, -1); + + // Test with non-ASCII characters. Some tests do not test equivalence using $substrBytes because + // $substrBytes disallows taking a substring that begins or ends in the middle of a UTF-8 + // encoding of a character. + testExpressionBytes(coll, {$indexOfBytes: ["a∫∫b", "b"]}, 7); + + // $substrBytes would attempt to take the substring from the middle of a UTF-8 + // encoding of a character. 
+ testExpressionBytes(coll, {$indexOfBytes: ["a∫∫b", "b", 6]}, 7, false); + + testExpressionBytes(coll, {$indexOfBytes: ["abc∫ba", "∫"]}, 3); + + testExpressionBytes(coll, {$indexOfBytes: ["∫∫∫", "a"]}, -1); + + // $substrBytes would attempt to take the substring from the middle of a UTF-8 + // encoding of a character. + testExpressionBytes(coll, {$indexOfBytes: ["ab∫c", "c", 0, 3]}, -1, false); + + testExpressionBytes(coll, {$indexOfBytes: ["abc∫b∫", "b∫"]}, 6); + + // Test with embedded null bytes. + testExpressionBytes(coll, {$indexOfBytes: ["abc\0d", "d"]}, 4); + + testExpressionBytes(coll, {$indexOfBytes: ["abc\0", "\0"]}, 3); + + testExpressionBytes(coll, {$indexOfBytes: ["abc\0d\0", "d", 5, 6]}, -1); + + // Error cases. + + var pipeline = { + $project: { + output: { + $indexOfBytes: [3, "s"], + } + } + }; + assertErrorCode(coll, pipeline, 40091); + + pipeline = { + $project: { + output: { + $indexOfBytes: ["s", 3], + } + } + }; + assertErrorCode(coll, pipeline, 40092); + + pipeline = { + $project: {output: {$indexOfBytes: ["abc", "b", "bad"]}} + }; + assertErrorCode(coll, pipeline, 40096); + + pipeline = { + $project: {output: {$indexOfBytes: ["abc", "b", 0, "bad"]}} + }; + assertErrorCode(coll, pipeline, 40096); + + pipeline = { + $project: {output: {$indexOfBytes: ["abc", "b", -1]}} + }; + assertErrorCode(coll, pipeline, 40097); + + pipeline = { + $project: {output: {$indexOfBytes: ["abc", "b", 1, -1]}} + }; + assertErrorCode(coll, pipeline, 40097); +}()); diff --git a/jstests/aggregation/expressions/indexof_codepoints.js b/jstests/aggregation/expressions/indexof_codepoints.js new file mode 100644 index 00000000000..20b9534b050 --- /dev/null +++ b/jstests/aggregation/expressions/indexof_codepoints.js @@ -0,0 +1,129 @@ +// In SERVER-8951, $indexOfCP was introduced. In this file, we test the correctness and error +// cases of the expression. +load("jstests/aggregation/extras/utils.js"); // For assertErrorCode and testExpression. 
+ +(function() { + "use strict"; + + function testExpressionCodePoints(coll, expression, result, shouldTestEquivalence = true) { + testExpression(coll, expression, result); + + var indexOfSpec = expression["$indexOfCP"]; + if (shouldTestEquivalence) { + // If we are specifying a starting or ending index for the search, we should be able to + // achieve equivalent behavior using $substrCP. + var input = indexOfSpec[0]; + var token = indexOfSpec[1]; + var start = indexOfSpec.length > 2 ? indexOfSpec[2] : 0; + var end = indexOfSpec.length > 3 ? indexOfSpec[3] : { + $strLenCP: input + }; + + var substrExpr = { + $indexOfCP: [{$substrCP: [input, start, {$subtract: [end, start]}]}, token] + }; + + // Since the new expression takes the index with respect to a shortened string, the + // output index will differ from the index with respect to the full length string, + // unless the output is -1. + var substrResult = (result === -1) ? -1 : result - start; + + testExpression(coll, substrExpr, substrResult); + } + } + + var coll = db.indexofcp; + coll.drop(); + + // Insert a dummy document so something flows through the pipeline. + assert.writeOK(coll.insert({})); + + testExpressionCodePoints(coll, {$indexOfCP: ["∫aƒ", "ƒ"]}, 2); + + testExpressionCodePoints(coll, {$indexOfCP: ["a∫c", "d"]}, -1); + + testExpressionCodePoints(coll, {$indexOfCP: ["∫b∫ba", "b", 2]}, 3); + + testExpressionCodePoints(coll, {$indexOfCP: ["ab∫de", "d", 0, 3]}, -1); + + testExpressionCodePoints(coll, {$indexOfCP: ["ab∫de", "d", 0, 4]}, 3); + + testExpressionCodePoints(coll, {$indexOfCP: ["øøc", "ø", 1]}, 1); + + testExpressionCodePoints(coll, {$indexOfCP: ["øƒc", "ƒ", 0, 10]}, 1); + + testExpressionCodePoints(coll, {$indexOfCP: ["abcbabc", "b", 2, 4]}, 3); + + // $strLenCP does not accept null as an input. 
+ testExpressionCodePoints(coll, {$indexOfCP: [null, "√"]}, null, false); + + testExpressionCodePoints(coll, {$indexOfCP: ["abc", "b", 3]}, -1); + + // We are intentionally testing specifying an end index before the start index, which is why we + // cannot use $substrCP in checking for equivalence. + testExpressionCodePoints(coll, {$indexOfCP: ["a√cb", "b", 3, 1]}, -1, false); + + testExpressionCodePoints(coll, {$indexOfCP: ["a∫b", "b", 3, 5]}, -1); + + testExpressionCodePoints(coll, {$indexOfCP: ["", "∫"]}, -1); + + testExpressionCodePoints(coll, {$indexOfCP: [" ", ""]}, 0); + + testExpressionCodePoints(coll, {$indexOfCP: ["", ""]}, 0); + + testExpressionCodePoints(coll, {$indexOfCP: ["abc", "", 1]}, 1); + + // Test with multi-byte tokens. + + testExpressionCodePoints(coll, {$indexOfCP: ["abcƒe", "ƒe"]}, 3); + + testExpressionCodePoints(coll, {$indexOfCP: ["∫aeøø", "øøø"]}, -1); + + // Test with embedded null bytes. + + testExpressionCodePoints(coll, {$indexOfCP: ["ab∫\0d", "d"]}, 4); + + testExpressionCodePoints(coll, {$indexOfCP: ["øbc\0", "\0"]}, 3); + + testExpressionCodePoints(coll, {$indexOfCP: ["πbƒ\0d\0", "d", 5, 6]}, -1); + + // Error cases. 
+ + var pipeline = { + $project: { + output: { + $indexOfCP: [3, "s"], + } + } + }; + assertErrorCode(coll, pipeline, 40093); + + pipeline = { + $project: { + output: { + $indexOfCP: ["s", 3], + } + } + }; + assertErrorCode(coll, pipeline, 40094); + + pipeline = { + $project: {output: {$indexOfCP: ["abc", "b", "bad"]}} + }; + assertErrorCode(coll, pipeline, 40096); + + pipeline = { + $project: {output: {$indexOfCP: ["abc", "b", 0, "bad"]}} + }; + assertErrorCode(coll, pipeline, 40096); + + pipeline = { + $project: {output: {$indexOfCP: ["abc", "b", -1]}} + }; + assertErrorCode(coll, pipeline, 40097); + + pipeline = { + $project: {output: {$indexOfCP: ["abc", "b", 1, -1]}} + }; + assertErrorCode(coll, pipeline, 40097); +}()); diff --git a/jstests/aggregation/extras/utils.js b/jstests/aggregation/extras/utils.js index c1a8fc5645d..d8203f56b94 100644 --- a/jstests/aggregation/extras/utils.js +++ b/jstests/aggregation/extras/utils.js @@ -8,8 +8,8 @@ function testExpression(coll, expression, result) { var res = coll.aggregate({$project: {output: expression}}).toArray(); - assert.eq(res.length, 1); - assert.eq(res[0].output, result); + assert.eq(res.length, 1, tojson(res)); + assert.eq(res[0].output, result, tojson(res)); } /* diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index f462052d856..8024ec39cf2 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -359,6 +359,44 @@ intrusive_ptr<Expression> Expression::parseOperand(BSONElement exprElement, } } +namespace { +/** + * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially + * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a + * continuation byte. 
+ */ +bool isContinuationByte(char charByte) { + return (charByte & 0xc0) == 0x80; +} + +/** + * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially + * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading + * byte. + */ +bool isLeadingByte(char charByte) { + return (charByte & 0xc0) == 0xc0; +} + +/** + * UTF-8 single-byte code points are of the form 0xxxxxxx. This method checks whether 'charByte' is + * a single-byte code point. + */ +bool isSingleByte(char charByte) { + return (charByte & 0x80) == 0x0; +} + +size_t getCodePointLength(char charByte) { + if (isSingleByte(charByte)) { + return 1; + } + + invariant(isLeadingByte(charByte)); + + // In UTF-8, the number of leading ones is the number of bytes the code point takes up. + return countLeadingZeros64(~(uint64_t(charByte) << (64 - 8))); +} +} // namespace /* ----------------------- ExpressionAbs ---------------------------- */ @@ -2118,6 +2156,216 @@ const char* ExpressionIn::getOpName() const { return "$in"; } +/* ----------------------- ExpressionIndexOfArray ------------------ */ + +namespace { + +void uassertIfNotIntegralAndNonNegative(Value val, + StringData expressionName, + StringData argumentName) { + uassert(40096, + str::stream() << expressionName << " requires an integral " << argumentName + << ", found a value of type: " << typeName(val.getType()) + << ", with value: " << val.toString(), + val.integral()); + uassert(40097, + str::stream() << expressionName << " requires a nonnegative " << argumentName + << ", found: " << val.toString(), + val.coerceToInt() >= 0); +} + +} // namespace + +Value ExpressionIndexOfArray::evaluateInternal(Variables* vars) const { + Value arrayArg = vpOperand[0]->evaluateInternal(vars); + + if (arrayArg.nullish()) { + return Value(BSONNULL); + } + + uassert(40090, + str::stream() << "$indexOfArray requires an array as a first argument, found: " + << typeName(arrayArg.getType()), + 
arrayArg.isArray()); + + std::vector<Value> array = arrayArg.getArray(); + + Value searchItem = vpOperand[1]->evaluateInternal(vars); + + size_t startIndex = 0; + if (vpOperand.size() > 2) { + Value startIndexArg = vpOperand[2]->evaluateInternal(vars); + uassertIfNotIntegralAndNonNegative(startIndexArg, getOpName(), "starting index"); + startIndex = static_cast<size_t>(startIndexArg.coerceToInt()); + } + + size_t endIndex = array.size(); + if (vpOperand.size() > 3) { + Value endIndexArg = vpOperand[3]->evaluateInternal(vars); + uassertIfNotIntegralAndNonNegative(endIndexArg, getOpName(), "ending index"); + // Don't let 'endIndex' exceed the length of the array. + endIndex = std::min(array.size(), static_cast<size_t>(endIndexArg.coerceToInt())); + } + + for (size_t i = startIndex; i < endIndex; i++) { + if (array[i] == searchItem) { + return Value(static_cast<int>(i)); + } + } + + return Value(-1); +} + +REGISTER_EXPRESSION(indexOfArray, ExpressionIndexOfArray::parse); +const char* ExpressionIndexOfArray::getOpName() const { + return "$indexOfArray"; +} + +/* ----------------------- ExpressionIndexOfBytes ------------------ */ + +namespace { + +bool stringHasTokenAtIndex(size_t index, const std::string& input, const std::string& token) { + if (token.size() + index > input.size()) { + return false; + } + return input.compare(index, token.size(), token) == 0; +} + +} // namespace + +Value ExpressionIndexOfBytes::evaluateInternal(Variables* vars) const { + Value stringArg = vpOperand[0]->evaluateInternal(vars); + + if (stringArg.nullish()) { + return Value(BSONNULL); + } + + uassert(40091, + str::stream() << "$indexOfBytes requires a string as the first argument, found: " + << typeName(stringArg.getType()), + stringArg.getType() == String); + const std::string& input = stringArg.getString(); + + Value tokenArg = vpOperand[1]->evaluateInternal(vars); + uassert(40092, + str::stream() << "$indexOfBytes requires a string as the second argument, found: " + << 
typeName(tokenArg.getType()), + tokenArg.getType() == String); + const std::string& token = tokenArg.getString(); + + size_t startIndex = 0; + if (vpOperand.size() > 2) { + Value startIndexArg = vpOperand[2]->evaluateInternal(vars); + uassertIfNotIntegralAndNonNegative(startIndexArg, getOpName(), "starting index"); + startIndex = static_cast<size_t>(startIndexArg.coerceToInt()); + } + + size_t endIndex = input.size(); + if (vpOperand.size() > 3) { + Value endIndexArg = vpOperand[3]->evaluateInternal(vars); + uassertIfNotIntegralAndNonNegative(endIndexArg, getOpName(), "ending index"); + // Don't let 'endIndex' exceed the length of the string. + endIndex = std::min(input.size(), static_cast<size_t>(endIndexArg.coerceToInt())); + } + + if (startIndex > input.length() || endIndex < startIndex) { + return Value(-1); + } + + size_t position = input.substr(0, endIndex).find(token, startIndex); + if (position == std::string::npos) { + return Value(-1); + } + + return Value(static_cast<int>(position)); +} + +REGISTER_EXPRESSION(indexOfBytes, ExpressionIndexOfBytes::parse); +const char* ExpressionIndexOfBytes::getOpName() const { + return "$indexOfBytes"; +} + +/* ----------------------- ExpressionIndexOfCP --------------------- */ + +Value ExpressionIndexOfCP::evaluateInternal(Variables* vars) const { + Value stringArg = vpOperand[0]->evaluateInternal(vars); + + if (stringArg.nullish()) { + return Value(BSONNULL); + } + + uassert(40093, + str::stream() << "$indexOfCP requires a string as the first argument, found: " + << typeName(stringArg.getType()), + stringArg.getType() == String); + const std::string& input = stringArg.getString(); + + Value tokenArg = vpOperand[1]->evaluateInternal(vars); + uassert(40094, + str::stream() << "$indexOfCP requires a string as the second argument, found: " + << typeName(tokenArg.getType()), + tokenArg.getType() == String); + const std::string& token = tokenArg.getString(); + + size_t startCodePointIndex = 0; + if (vpOperand.size() > 2) { 
+ Value startIndexArg = vpOperand[2]->evaluateInternal(vars); + uassertIfNotIntegralAndNonNegative(startIndexArg, getOpName(), "starting index"); + startCodePointIndex = static_cast<size_t>(startIndexArg.coerceToInt()); + } + + // Compute the length (in code points) of the input, and convert 'startCodePointIndex' to a byte + // index. + size_t codePointLength = 0; + size_t startByteIndex = 0; + for (size_t byteIx = 0; byteIx < input.size(); ++codePointLength) { + if (codePointLength == startCodePointIndex) { + // We have determined the byte at which our search will start. + startByteIndex = byteIx; + } + + uassert( + 40095, "$indexOfCP found bad UTF-8 in the input", !isContinuationByte(input[byteIx])); + byteIx += getCodePointLength(input[byteIx]); + } + + size_t endCodePointIndex = codePointLength; + if (vpOperand.size() > 3) { + Value endIndexArg = vpOperand[3]->evaluateInternal(vars); + uassertIfNotIntegralAndNonNegative(endIndexArg, getOpName(), "ending index"); + + // Don't let 'endCodePointIndex' exceed the number of code points in the string. + endCodePointIndex = + std::min(codePointLength, static_cast<size_t>(endIndexArg.coerceToInt())); + } + + if (startByteIndex == 0 && input.empty() && token.empty()) { + // If we are finding the index of "" in the string "", the below loop will not loop, so we + // need a special case for this. + return Value(0); + } + + // We must keep track of which byte, and which code point, we are examining, being careful not + // to overflow either the length of the string or the ending code point. 
+ + size_t currentCodePointIndex = startCodePointIndex; + for (size_t byteIx = startByteIndex; currentCodePointIndex < endCodePointIndex; + ++currentCodePointIndex) { + if (stringHasTokenAtIndex(byteIx, input, token)) { + return Value(static_cast<int>(currentCodePointIndex)); + } + byteIx += getCodePointLength(input[byteIx]); + } + + return Value(-1); +} + +REGISTER_EXPRESSION(indexOfCP, ExpressionIndexOfCP::parse); +const char* ExpressionIndexOfCP::getOpName() const { + return "$indexOfCP"; +} + /* ----------------------- ExpressionLn ---------------------------- */ Value ExpressionLn::evaluateNumericArg(const Value& numericArg) const { @@ -3071,18 +3319,6 @@ const char* ExpressionSize::getOpName() const { /* ----------------------- ExpressionSplit --------------------------- */ -namespace { - -bool stringHasTokenAtIndex(size_t index, const std::string& input, const std::string& token) { - if (token.size() + index > input.size()) { - return false; - } - - return input.compare(index, token.size(), token) == 0; -} - -} // namespace - Value ExpressionSplit::evaluateInternal(Variables* vars) const { Value inputArg = vpOperand[0]->evaluateInternal(vars); Value separatorArg = vpOperand[1]->evaluateInternal(vars); @@ -3174,45 +3410,6 @@ const char* ExpressionStrcasecmp::getOpName() const { return "$strcasecmp"; } -namespace { -/** - * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially - * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a - * continuation byte. - */ -bool isContinuationByte(char charByte) { - return (charByte & 0xc0) == 0x80; -} - -/** - * UTF-8 multi-byte code points consist of one leading byte of the form 11xxxxxx, and potentially - * many continuation bytes of the form 10xxxxxx. This method checks whether 'charByte' is a leading - * byte. 
- */ -bool isLeadingByte(char charByte) { - return (charByte & 0xc0) == 0xc0; -} - -/** - * UTF-8 single-byte code points are of the form 0xxxxxxx. This method checks whether 'charByte' is - * a single-byte code point. - */ -bool isSingleByte(char charByte) { - return (charByte & 0x80) == 0x0; -} - -size_t getCodePointLength(char charByte) { - if (isSingleByte(charByte)) { - return 1; - } - - invariant(isLeadingByte(charByte)); - - // In UTF-8, the number of leading ones is the number of bytes the code point takes up. - return countLeadingZeros64(~(uint64_t(charByte) << (64 - 8))); -} -} // namespace - /* ----------------------- ExpressionSubstrBytes ---------------------------- */ Value ExpressionSubstrBytes::evaluateInternal(Variables* vars) const { diff --git a/src/mongo/db/pipeline/expression.h b/src/mongo/db/pipeline/expression.h index 777fbd4dc8c..87a9536e689 100644 --- a/src/mongo/db/pipeline/expression.h +++ b/src/mongo/db/pipeline/expression.h @@ -859,6 +859,30 @@ public: }; +class ExpressionIndexOfArray final : public ExpressionRangedArity<ExpressionIndexOfArray, 2, 4> { +public: + Value evaluateInternal(Variables* vars) const final; + const char* getOpName() const final; +}; + + +class ExpressionIndexOfBytes final : public ExpressionRangedArity<ExpressionIndexOfBytes, 2, 4> { +public: + Value evaluateInternal(Variables* vars) const final; + const char* getOpName() const final; +}; + + +/** + * Implements indexOf behavior for strings with UTF-8 encoding. + */ +class ExpressionIndexOfCP final : public ExpressionRangedArity<ExpressionIndexOfCP, 2, 4> { +public: + Value evaluateInternal(Variables* vars) const final; + const char* getOpName() const final; +}; + + class ExpressionLet final : public Expression { public: boost::intrusive_ptr<Expression> optimize() final; |