summaryrefslogtreecommitdiff
path: root/src/mongo/db/pipeline/expression.cpp
diff options
context:
space:
mode:
authorIan Boros <puppyofkosh@gmail.com>2019-04-23 12:53:10 -0400
committerIan Boros <puppyofkosh@gmail.com>2019-04-23 12:56:00 -0400
commit03a6b3f910e051624dd6b3065b5ee9ea3b6c6fb5 (patch)
tree7b847a779c9b8f2ae37c4b43bb9018a7d048395c /src/mongo/db/pipeline/expression.cpp
parentc3a2176d5124ef3dc160db2b9f85c698e1ca2664 (diff)
downloadmongo-03a6b3f910e051624dd6b3065b5ee9ea3b6c6fb5.tar.gz
Revert "SERVER-40083 Don't recompile each time $regex is evaluated when regex argument is a constant"
This reverts commit a60f6a53734fa3a022e9ba39bbdab95608ba9108.
Diffstat (limited to 'src/mongo/db/pipeline/expression.cpp')
-rw-r--r--src/mongo/db/pipeline/expression.cpp444
1 files changed, 200 insertions, 244 deletions
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp
index 8b6b00739e4..364f8804cd9 100644
--- a/src/mongo/db/pipeline/expression.cpp
+++ b/src/mongo/db/pipeline/expression.cpp
@@ -1,4 +1,3 @@
-
/**
* Copyright (C) 2018-present MongoDB, Inc.
*
@@ -36,6 +35,7 @@
#include <algorithm>
#include <boost/algorithm/string.hpp>
#include <cstdio>
+#include <pcre.h>
#include <pcrecpp.h>
#include <vector>
@@ -5669,246 +5669,213 @@ Value ExpressionConvert::performConversion(BSONType targetType, Value inputValue
BSONType inputType = inputValue.getType();
return table.findConversionFunc(inputType, targetType)(getExpressionContext(), inputValue);
}
-namespace {
-
-boost::optional<Value> extractValueFromConstantExpression(
- const std::string& fieldName,
- const std::vector<std::pair<std::string, boost::intrusive_ptr<Expression>>>& childExpressions) {
- // Find the element with the fieldName.
- auto expressionPairItr = std::find_if(
- childExpressions.begin(), childExpressions.end(), [&](const auto& childExpression) {
- return childExpression.first == fieldName;
- });
- // If the field doesn't exists it is still eligible for optimization.
- if (expressionPairItr == childExpressions.end()) {
- return Value(BSONNULL);
- }
+namespace {
- // If the field exists and not null/constant, we cannot optimize.
- if (!ExpressionConstant::isNullOrConstant(expressionPairItr->second)) {
- return boost::none;
+class RegexMatchHandler {
+public:
+ RegexMatchHandler(const Value& inputExpr) : _pcre(nullptr), _nullish(false) {
+ _validateInputAndExtractElements(inputExpr);
+ _compile(regex_util::flags2PcreOptions(_options, false).all_options());
}
- auto* expression = expressionPairItr->second.get();
- return dynamic_cast<ExpressionConstant*>(expression)->getValue();
-}
-
-} // namespace
-void RegexMatchHandler::optimize(boost::intrusive_ptr<Expression> expression) {
- auto optimizedExpr = expression->optimize();
-
- // If 'input', 'regex' and 'options' are null/constant then 'optimize()' will convert the
- // object to an expression of type 'ExpressionConstant'.
- if (auto* exprObj = dynamic_cast<ExpressionConstant*>(optimizedExpr.get())) {
- _initialExecStateForConstantRegex = buildInitialState(exprObj->getValue());
- } else if (auto* exprObj = dynamic_cast<ExpressionObject*>(optimizedExpr.get())) {
- // Extract the children and check for constant 'regex' and 'options'.
- auto& children = exprObj->getChildExpressions();
- auto regex = extractValueFromConstantExpression("regex", children);
- auto options = extractValueFromConstantExpression("options", children);
-
- // If both 'regex' and 'options' are null/constant, we can pre-compile the execution state.
- if (regex && options) {
- RegexExecutionState executionState;
- _extractRegexAndOptions(&executionState, *regex, *options);
- _compile(&executionState);
- _initialExecStateForConstantRegex = std::move(executionState);
+ ~RegexMatchHandler() {
+ if (_pcre != nullptr) {
+ pcre_free(_pcre);
}
}
-}
-
-RegexMatchHandler::RegexExecutionState RegexMatchHandler::buildInitialState(
- const Value& inputExpr) const {
- uassert(51103,
- str::stream() << "expression expects an object of named arguments, but found type "
- << inputExpr.getType(),
- inputExpr.getType() == BSONType::Object);
- Value textInput = inputExpr.getDocument().getField("input");
- Value regexPattern = inputExpr.getDocument().getField("regex");
- Value regexOptions = inputExpr.getDocument().getField("options");
-
- auto executionState = _initialExecStateForConstantRegex.value_or(RegexExecutionState());
-
- // The 'input' parameter can be a variable and needs to be extracted from the expression
- // document even when '_preExecutionState' is present.
- _extractInputField(&executionState, textInput);
-
- // If we have a prebuilt execution state, then the 'regex' and 'options' fields are constant
- // values, and we do not need to re-compile them.
- if (!hasConstantRegex()) {
- _extractRegexAndOptions(&executionState, regexPattern, regexOptions);
- _compile(&executionState);
- }
-
- return executionState;
-}
-
-int RegexMatchHandler::execute(RegexExecutionState* regexState) const {
- invariant(regexState);
- invariant(!regexState->nullish());
- invariant(regexState->pcre);
- int execResult = pcre_exec(regexState->pcre.get(),
- 0,
- regexState->input->c_str(),
- regexState->input->size(),
- regexState->startBytePos,
- 0, // No need to overwrite the options set during pcre_compile.
- &(regexState->capturesBuffer.front()),
- regexState->capturesBuffer.size());
- // The 'execResult' will be (numCaptures + 1) if there is a match, -1 if there is no match,
- // negative (other than -1) if there is an error during execution, and zero if capturesBuffer's
- // capacity is not sufficient to hold all the results. The latter scenario should never occur.
- uassert(51156,
+ int execute(int startBytePos) {
+ int execResult = pcre_exec(_pcre,
+ 0,
+ _input.c_str(),
+ _input.size(),
+ startBytePos,
+ 0, // No need to overwrite the options set during pcre_compile.
+ &_capturesBuffer.front(),
+ _capturesBuffer.size());
+ // The 'execResult' will be (_numCaptures + 1) if there is a match, -1 if there is no
+ // match, negative if there is an error during execution, and zero if _capturesBuffer's
+ // capacity is not sufficient to hold all the results. The latter scenario should never
+ // occur.
+ uassert(
+ 51156,
str::stream() << "Error occurred while executing the regular expression. Result code:"
<< execResult,
- execResult == -1 || execResult == (regexState->numCaptures + 1));
- return execResult;
-}
+ execResult == -1 || execResult == (_numCaptures + 1));
+ return execResult;
+ }
-Value RegexMatchHandler::nextMatch(RegexExecutionState* regexState) const {
- int execResult = execute(regexState);
+ /**
+ * The function will match '_input' string based on the regex pattern present in '_pcre'. If
+ * there is a match, the function will return a 'Value' object encapsulating the matched string,
+ * the code point index of the matched string and a vector representing all the captured
+ * substrings. The function will also update the parameters 'startBytePos' and
+ * 'startCodePointPos' to the corresponding new indices. If there is no match, the function will
+ * return null 'Value' object.
+ */
+ Value nextMatch(int* startBytePos, int* startCodePointPos) {
+ invariant(startBytePos != nullptr && startCodePointPos != nullptr);
+
+ // Use input as StringData throughout the function to avoid copying the string on 'substr'
+ // calls.
+ StringData input = _input;
+ int execResult = execute(*startBytePos);
+ // No match.
+ if (execResult < 0) {
+ return Value(BSONNULL);
+ }
- // No match.
- if (execResult < 0) {
- return Value(BSONNULL);
+ // The first and second entries of the '_capturesBuffer' will have the start and limit
+ // indices of the matched string, as byte offsets. '(limit - startIndex)' would be the
+ // length of the captured string.
+ const int matchStartByteIndex = _capturesBuffer[0];
+ StringData matchedStr =
+ input.substr(matchStartByteIndex, _capturesBuffer[1] - matchStartByteIndex);
+ // We iterate through the input string's contents preceding the match index, in order to
+ // convert the byte offset to a code point offset.
+ for (int byteIx = *startBytePos; byteIx < matchStartByteIndex; ++(*startCodePointPos)) {
+ byteIx += getCodePointLength(input[byteIx]);
+ }
+ // Set the start index for match to the new one.
+ *startBytePos = matchStartByteIndex;
+
+ std::vector<Value> captures;
+ captures.reserve(_numCaptures);
+ // The next '2 * numCaptures' entries (after the first two entries) of '_capturesBuffer'
+ // will hold the start index and limit pairs, for each of the capture groups. We skip the
+ // first two elements and start iteration from 3rd element so that we only construct the
+ // strings for capture groups.
+ for (int i = 0; i < _numCaptures; ++i) {
+ const int start = _capturesBuffer[2 * (i + 1)];
+ const int limit = _capturesBuffer[2 * (i + 1) + 1];
+ captures.push_back(Value(input.substr(start, limit - start)));
+ }
+
+ MutableDocument match;
+ match.addField("match", Value(matchedStr));
+ match.addField("idx", Value(*startCodePointPos));
+ match.addField("captures", Value(captures));
+ return match.freezeToValue();
}
- // Use 'input' as StringData throughout the function to avoid copying the string on 'substr'
- // calls.
- StringData input = *(regexState->input);
+ int numCaptures() {
+ return _numCaptures;
+ }
- // The first and second entries of the 'capturesBuffer' will have the start and (end+1) indices
- // of the matched string, as byte offsets. '(limit - startIndex)' would be the length of the
- // captured string.
- const int matchStartByteIndex = regexState->capturesBuffer[0];
- StringData matchedStr =
- input.substr(matchStartByteIndex, regexState->capturesBuffer[1] - matchStartByteIndex);
+ bool nullish() {
+ return _nullish;
+ }
- // We iterate through the input string's contents preceding the match index, in order to convert
- // the byte offset to a code point offset.
- for (int byteIx = regexState->startBytePos; byteIx < matchStartByteIndex;
- ++(regexState->startCodePointPos)) {
- byteIx += getCodePointLength(input[byteIx]);
+ StringData getInput() {
+ return _input;
}
- // Set the start index for match to the new one.
- regexState->startBytePos = matchStartByteIndex;
-
- std::vector<Value> captures;
- captures.reserve(regexState->numCaptures);
-
- // The next '2 * numCaptures' entries (after the first two entries) of 'capturesBuffer' will
- // hold the start index and limit pairs, for each of the capture groups. We skip the first two
- // elements and start iteration from 3rd element so that we only construct the strings for
- // capture groups.
- for (int i = 0; i < regexState->numCaptures; ++i) {
- const int start = regexState->capturesBuffer[2 * (i + 1)];
- const int limit = regexState->capturesBuffer[2 * (i + 1) + 1];
- captures.push_back(Value(input.substr(start, limit - start)));
- }
-
- MutableDocument match;
- match.addField("match", Value(matchedStr));
- match.addField("idx", Value(regexState->startCodePointPos));
- match.addField("captures", Value(captures));
- return match.freezeToValue();
-}
-
-void RegexMatchHandler::_compile(RegexExecutionState* executionState) const {
- const auto pcreOptions =
- regex_util::flags2PcreOptions(executionState->options, false).all_options();
-
- if (!executionState->pattern) {
- return;
- }
- const char* compile_error;
- int eoffset;
-
- // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the index of
- // the match). So we are using the C interface. First we compile all the regex options to
- // generate pcre object, which will later be used to match against the input string.
- executionState->pcre = std::shared_ptr<pcre>(
- pcre_compile(
- executionState->pattern->c_str(), pcreOptions, &compile_error, &eoffset, nullptr),
- pcre_free);
- uassert(51111, str::stream() << "Invalid Regex: " << compile_error, executionState->pcre);
-
- // Calculate the number of capture groups present in 'pattern' and store in 'numCaptures'.
- const int pcre_retval = pcre_fullinfo(
- executionState->pcre.get(), NULL, PCRE_INFO_CAPTURECOUNT, &executionState->numCaptures);
- invariant(pcre_retval == 0);
-
- // The first two-thirds of the vector is used to pass back captured substrings' start and
- // (end+1) indexes. The remaining third of the vector is used as workspace by pcre_exec() while
- // matching capturing subpatterns, and is not available for passing back information.
- // pcre_compile will error if there are too many capture groups in the pattern. As long as this
- // memory is allocated after compile, the amount of memory allocated will not be too high.
- executionState->capturesBuffer.resize((1 + executionState->numCaptures) * 3);
-}
-
-void RegexMatchHandler::_extractInputField(RegexExecutionState* executionState,
- const Value& textInput) const {
- uassert(51104,
- "'input' field should be of type string",
- textInput.nullish() || textInput.getType() == BSONType::String);
- if (textInput.getType() == BSONType::String) {
- executionState->input = textInput.getString();
- }
-}
-
-void RegexMatchHandler::_extractRegexAndOptions(RegexExecutionState* executionState,
- const Value& regexPattern,
- const Value& regexOptions) const {
- uassert(51105,
- "'regex' field should be of type string or regex",
- regexPattern.nullish() || regexPattern.getType() == BSONType::String ||
- regexPattern.getType() == BSONType::RegEx);
- uassert(51106,
- "'options' should be of type string",
- regexOptions.nullish() || regexOptions.getType() == BSONType::String);
-
- // The 'regex' field can be a RegEx object and may have its own options...
- if (regexPattern.getType() == BSONType::RegEx) {
- StringData regexFlags = regexPattern.getRegexFlags();
- executionState->pattern = regexPattern.getRegex();
- uassert(
- 51107,
- str::stream() << "Found regex option(s) specified in both 'regex' and 'option' fields",
- regexOptions.nullish() || regexFlags.empty());
- if (!regexFlags.empty()) {
- executionState->options = regexFlags.toString();
+private:
+ RegexMatchHandler(const RegexMatchHandler&) = delete;
+
+ void _compile(const int pcreOptions) {
+ const char* compile_error;
+ int eoffset;
+ // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the
+ // index of the match). So we are using the C interface. First we compile all the regex
+ // options to generate pcre object, which will later be used to match against the input
+ // string.
+ _pcre = pcre_compile(_pattern.c_str(), pcreOptions, &compile_error, &eoffset, nullptr);
+ if (_pcre == nullptr) {
+ uasserted(51111, str::stream() << "Invalid Regex: " << compile_error);
}
- } else if (regexPattern.getType() == BSONType::String) {
- // ...or it can be a string field with options specified separately.
- executionState->pattern = regexPattern.getString();
- }
- // If 'options' is non-null, we must extract and validate its contents even if 'regexPattern' is
- // nullish.
- if (!regexOptions.nullish()) {
- executionState->options = regexOptions.getString();
- }
- uassert(51109,
- "Regular expression cannot contain an embedded null byte",
- !executionState->pattern || executionState->pattern->find('\0', 0) == string::npos);
- uassert(51110,
- "Regular expression options string cannot contain an embedded null byte",
- executionState->options.find('\0', 0) == string::npos);
-}
+ // Calculate the number of capture groups present in '_pattern' and store in '_numCaptures'.
+ int pcre_retval = pcre_fullinfo(_pcre, NULL, PCRE_INFO_CAPTURECOUNT, &_numCaptures);
+ invariant(pcre_retval == 0);
+
+ // The first two-thirds of the vector is used to pass back captured substrings' start and
+ // limit indexes. The remaining third of the vector is used as workspace by pcre_exec()
+ // while matching capturing subpatterns, and is not available for passing back information.
+ // pcre_compile will error if there are too many capture groups in the pattern. As long as
+ // this memory is allocated after compile, the amount of memory allocated will not be too
+ // high.
+ _capturesBuffer = std::vector<int>((1 + _numCaptures) * 3);
+ }
+
+ void _validateInputAndExtractElements(const Value& inputExpr) {
+ uassert(51103,
+ str::stream() << "$regexFind expects an object of named arguments, but found type "
+ << inputExpr.getType(),
+ inputExpr.getType() == BSONType::Object);
+ Value textInput = inputExpr.getDocument().getField("input");
+ Value regexPattern = inputExpr.getDocument().getField("regex");
+ Value regexOptions = inputExpr.getDocument().getField("options");
+
+ uassert(51104,
+ "'input' field should be of type string",
+ textInput.nullish() || textInput.getType() == BSONType::String);
+ uassert(51105,
+ "'regex' field should be of type string or regex",
+ regexPattern.nullish() || regexPattern.getType() == BSONType::String ||
+ regexPattern.getType() == BSONType::RegEx);
+ uassert(51106,
+ "'options' should be of type string",
+ regexOptions.nullish() || regexOptions.getType() == BSONType::String);
+
+ // If either the text input or regex pattern is nullish, then we consider the operation as a
+ // whole nullish.
+ _nullish = textInput.nullish() || regexPattern.nullish();
+
+ if (textInput.getType() == BSONType::String) {
+ _input = textInput.getString();
+ }
-boost::intrusive_ptr<Expression> ExpressionRegexFind::optimize() {
- _handler.optimize(vpOperand[0]);
- return this;
-}
+ // The 'regex' field can be a RegEx object and may have its own options...
+ if (regexPattern.getType() == BSONType::RegEx) {
+ StringData regexFlags = regexPattern.getRegexFlags();
+ _pattern = regexPattern.getRegex();
+ uassert(51107,
+ str::stream()
+ << "Found regex option(s) specified in both 'regex' and 'option' fields",
+ regexOptions.nullish() || regexFlags.empty());
+ if (!regexFlags.empty()) {
+ _options = regexFlags.toString();
+ }
+ } else if (regexPattern.getType() == BSONType::String) {
+ // ...or it can be a string field with options specified separately.
+ _pattern = regexPattern.getString();
+ }
+ // If 'options' is non-null, we must extract and validate its contents even if
+ // 'regexPattern' is nullish.
+ if (!regexOptions.nullish()) {
+ _options = regexOptions.getString();
+ }
+ uassert(51109,
+ "Regular expression cannot contain an embedded null byte",
+ _pattern.find('\0', 0) == string::npos);
+ uassert(51110,
+ "Regular expression options string cannot contain an embedded null byte",
+ _options.find('\0', 0) == string::npos);
+ }
+
+ pcre* _pcre;
+ // Number of capture groups present in '_pattern'.
+ int _numCaptures;
+ // Holds the start and limit indices of match and captures for the current match.
+ std::vector<int> _capturesBuffer;
+ std::string _input;
+ std::string _pattern;
+ std::string _options;
+ bool _nullish;
+};
+
+} // namespace
Value ExpressionRegexFind::evaluate(const Document& root) const {
- auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root));
- if (executionState.nullish()) {
+
+ RegexMatchHandler regex(vpOperand[0]->evaluate(root));
+ if (regex.nullish()) {
return Value(BSONNULL);
}
- return _handler.nextMatch(&executionState);
+ int startByteIndex = 0, startCodePointIndex = 0;
+ return regex.nextMatch(&startByteIndex, &startCodePointIndex);
}
REGISTER_EXPRESSION(regexFind, ExpressionRegexFind::parse);
@@ -5916,24 +5883,21 @@ const char* ExpressionRegexFind::getOpName() const {
return "$regexFind";
}
-boost::intrusive_ptr<Expression> ExpressionRegexFindAll::optimize() {
- _handler.optimize(vpOperand[0]);
- return this;
-}
-
Value ExpressionRegexFindAll::evaluate(const Document& root) const {
+
std::vector<Value> output;
- auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root));
- if (executionState.nullish()) {
+ RegexMatchHandler regex(vpOperand[0]->evaluate(root));
+ if (regex.nullish()) {
return Value(output);
}
- StringData input = *(executionState.input);
+ int startByteIndex = 0, startCodePointIndex = 0;
+ StringData input = regex.getInput();
size_t totalDocSize = 0;
// Using do...while loop because, when input is an empty string, we still want to see if there
// is a match.
do {
- auto matchObj = _handler.nextMatch(&executionState);
+ auto matchObj = regex.nextMatch(&startByteIndex, &startCodePointIndex);
if (matchObj.getType() == BSONType::jstNULL) {
break;
}
@@ -5949,22 +5913,19 @@ Value ExpressionRegexFindAll::evaluate(const Document& root) const {
// the character at startByteIndex matches the regex, we cannot return it since we are
// already returing an empty string starting at this index. So we move on to the next
// byte index.
- executionState.startBytePos += getCodePointLength(input[executionState.startBytePos]);
- ++executionState.startCodePointPos;
+ startByteIndex += getCodePointLength(input[startByteIndex]);
+ ++startCodePointIndex;
continue;
}
-
- // We don't want any overlapping sub-strings. So we move 'startBytePos' to point to the
+ // We don't want any overlapping sub-strings. So we move 'startByteIndex' to point to the
// byte after 'matchStr'. We move the code point index also correspondingly.
- executionState.startBytePos += matchStr.size();
- for (size_t byteIx = 0; byteIx < matchStr.size(); ++executionState.startCodePointPos) {
+ startByteIndex += matchStr.size();
+ for (size_t byteIx = 0; byteIx < matchStr.size(); ++startCodePointIndex) {
byteIx += getCodePointLength(matchStr[byteIx]);
}
-
- invariant(executionState.startBytePos > 0);
- invariant(executionState.startCodePointPos > 0);
- invariant(executionState.startCodePointPos <= executionState.startBytePos);
- } while (static_cast<size_t>(executionState.startBytePos) < input.size());
+ invariant(startByteIndex > 0 && startCodePointIndex > 0 &&
+ startCodePointIndex <= startByteIndex);
+ } while (static_cast<size_t>(startByteIndex) < input.size());
return Value(output);
}
@@ -5973,15 +5934,10 @@ const char* ExpressionRegexFindAll::getOpName() const {
return "$regexFindAll";
}
-boost::intrusive_ptr<Expression> ExpressionRegexMatch::optimize() {
- _handler.optimize(vpOperand[0]);
- return this;
-}
-
Value ExpressionRegexMatch::evaluate(const Document& root) const {
- auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root));
+ RegexMatchHandler regex(vpOperand[0]->evaluate(root));
// Return output of execute only if regex is not nullish.
- return executionState.nullish() ? Value(false) : Value(_handler.execute(&executionState) > 0);
+ return regex.nullish() ? Value(false) : Value(regex.execute(0) > 0);
}
REGISTER_EXPRESSION(regexMatch, ExpressionRegexMatch::parse);