diff options
author | Arun Banala <arun.banala@mongodb.com> | 2019-03-26 19:58:24 +0000 |
---|---|---|
committer | Arun Banala <arun.banala@mongodb.com> | 2019-04-26 15:19:48 +0100 |
commit | 7e700f6668fcbd5b96d46884364f1a7377945abb (patch) | |
tree | fd8d07efa6037c2ea49a6d295cf19621396e5528 /src/mongo/db/pipeline/expression.cpp | |
parent | 28430f0ec94060dbe2f5d3e8c77d77f5570b6de9 (diff) | |
download | mongo-7e700f6668fcbd5b96d46884364f1a7377945abb.tar.gz |
SERVER-40083 Don't recompile each time $regex is evaluated when regex argument is a constant
Diffstat (limited to 'src/mongo/db/pipeline/expression.cpp')
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 444 |
1 files changed, 244 insertions, 200 deletions
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index 364f8804cd9..ec0a0d8f435 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -1,3 +1,4 @@ + /** * Copyright (C) 2018-present MongoDB, Inc. * @@ -35,7 +36,6 @@ #include <algorithm> #include <boost/algorithm/string.hpp> #include <cstdio> -#include <pcre.h> #include <pcrecpp.h> #include <vector> @@ -5669,213 +5669,246 @@ Value ExpressionConvert::performConversion(BSONType targetType, Value inputValue BSONType inputType = inputValue.getType(); return table.findConversionFunc(inputType, targetType)(getExpressionContext(), inputValue); } - namespace { -class RegexMatchHandler { -public: - RegexMatchHandler(const Value& inputExpr) : _pcre(nullptr), _nullish(false) { - _validateInputAndExtractElements(inputExpr); - _compile(regex_util::flags2PcreOptions(_options, false).all_options()); - } +boost::optional<Value> extractValueFromConstantExpression( + const std::string& fieldName, + const std::vector<std::pair<std::string, boost::intrusive_ptr<Expression>>>& childExpressions) { + // Find the element with the fieldName. + auto expressionPairItr = std::find_if( + childExpressions.begin(), childExpressions.end(), [&](const auto& childExpression) { + return childExpression.first == fieldName; + }); - ~RegexMatchHandler() { - if (_pcre != nullptr) { - pcre_free(_pcre); - } + // If the field doesn't exists it is still eligible for optimization. + if (expressionPairItr == childExpressions.end()) { + return Value(BSONNULL); } - int execute(int startBytePos) { - int execResult = pcre_exec(_pcre, - 0, - _input.c_str(), - _input.size(), - startBytePos, - 0, // No need to overwrite the options set during pcre_compile. - &_capturesBuffer.front(), - _capturesBuffer.size()); - // The 'execResult' will be (_numCaptures + 1) if there is a match, -1 if there is no - // match, negative if there is an error during execution, and zero if _capturesBuffer's - // capacity is not sufficient to hold all the results. The latter scenario should never - // occur. - uassert( - 51156, - str::stream() << "Error occurred while executing the regular expression. Result code:" - << execResult, - execResult == -1 || execResult == (_numCaptures + 1)); - return execResult; + // If the field exists and not null/constant, we cannot optimize. + if (!ExpressionConstant::isNullOrConstant(expressionPairItr->second)) { + return boost::none; } + auto* expression = expressionPairItr->second.get(); + return dynamic_cast<ExpressionConstant*>(expression)->getValue(); +} - /** - * The function will match '_input' string based on the regex pattern present in '_pcre'. If - * there is a match, the function will return a 'Value' object encapsulating the matched string, - * the code point index of the matched string and a vector representing all the captured - * substrings. The function will also update the parameters 'startBytePos' and - * 'startCodePointPos' to the corresponding new indices. If there is no match, the function will - * return null 'Value' object. - */ - Value nextMatch(int* startBytePos, int* startCodePointPos) { - invariant(startBytePos != nullptr && startCodePointPos != nullptr); - - // Use input as StringData throughout the function to avoid copying the string on 'substr' - // calls. - StringData input = _input; - int execResult = execute(*startBytePos); - // No match. - if (execResult < 0) { - return Value(BSONNULL); - } +} // namespace - // The first and second entries of the '_capturesBuffer' will have the start and limit - // indices of the matched string, as byte offsets. '(limit - startIndex)' would be the - // length of the captured string. - const int matchStartByteIndex = _capturesBuffer[0]; - StringData matchedStr = - input.substr(matchStartByteIndex, _capturesBuffer[1] - matchStartByteIndex); - // We iterate through the input string's contents preceding the match index, in order to - // convert the byte offset to a code point offset. - for (int byteIx = *startBytePos; byteIx < matchStartByteIndex; ++(*startCodePointPos)) { - byteIx += getCodePointLength(input[byteIx]); - } - // Set the start index for match to the new one. - *startBytePos = matchStartByteIndex; - - std::vector<Value> captures; - captures.reserve(_numCaptures); - // The next '2 * numCaptures' entries (after the first two entries) of '_capturesBuffer' - // will hold the start index and limit pairs, for each of the capture groups. We skip the - // first two elements and start iteration from 3rd element so that we only construct the - // strings for capture groups. - for (int i = 0; i < _numCaptures; ++i) { - const int start = _capturesBuffer[2 * (i + 1)]; - const int limit = _capturesBuffer[2 * (i + 1) + 1]; - captures.push_back(Value(input.substr(start, limit - start))); +void RegexMatchHandler::optimize(boost::intrusive_ptr<Expression> expression) { + auto optimizedExpr = expression->optimize(); + + // If 'input', 'regex' and 'options' are null/constant then 'optimize()' will convert the + // object to an expression of type 'ExpressionConstant'. + if (auto* exprObj = dynamic_cast<ExpressionConstant*>(optimizedExpr.get())) { + _initialExecStateForConstantRegex = buildInitialState(exprObj->getValue()); + } else if (auto* exprObj = dynamic_cast<ExpressionObject*>(optimizedExpr.get())) { + // Extract the children and check for constant 'regex' and 'options'. + auto& children = exprObj->getChildExpressions(); + auto regex = extractValueFromConstantExpression("regex", children); + auto options = extractValueFromConstantExpression("options", children); + + // If both 'regex' and 'options' are null/constant, we can pre-compile the execution state. + if (regex && options) { + RegexExecutionState executionState; + _extractRegexAndOptions(&executionState, *regex, *options); + _compile(&executionState); + _initialExecStateForConstantRegex = std::move(executionState); } - - MutableDocument match; - match.addField("match", Value(matchedStr)); - match.addField("idx", Value(*startCodePointPos)); - match.addField("captures", Value(captures)); - return match.freezeToValue(); } +} - int numCaptures() { - return _numCaptures; - } +RegexMatchHandler::RegexExecutionState RegexMatchHandler::buildInitialState( + const Value& inputExpr) const { + uassert(51103, + str::stream() << "expression expects an object of named arguments, but found type " + << inputExpr.getType(), + inputExpr.getType() == BSONType::Object); + Value textInput = inputExpr.getDocument().getField("input"); + Value regexPattern = inputExpr.getDocument().getField("regex"); + Value regexOptions = inputExpr.getDocument().getField("options"); + + auto executionState = _initialExecStateForConstantRegex.value_or(RegexExecutionState()); - bool nullish() { - return _nullish; + // The 'input' parameter can be a variable and needs to be extracted from the expression + // document even when '_preExecutionState' is present. + _extractInputField(&executionState, textInput); + + // If we have a prebuilt execution state, then the 'regex' and 'options' fields are constant + // values, and we do not need to re-compile them. + if (!hasConstantRegex()) { + _extractRegexAndOptions(&executionState, regexPattern, regexOptions); + _compile(&executionState); } - StringData getInput() { - return _input; + return executionState; +} + +int RegexMatchHandler::execute(RegexExecutionState* regexState) const { + invariant(regexState); + invariant(!regexState->nullish()); + invariant(regexState->pcrePtr); + + int execResult = pcre_exec(regexState->pcrePtr.get(), + 0, + regexState->input->c_str(), + regexState->input->size(), + regexState->startBytePos, + 0, // No need to overwrite the options set during pcre_compile. + &(regexState->capturesBuffer.front()), + regexState->capturesBuffer.size()); + // The 'execResult' will be (numCaptures + 1) if there is a match, -1 if there is no match, + // negative (other than -1) if there is an error during execution, and zero if capturesBuffer's + // capacity is not sufficient to hold all the results. The latter scenario should never occur. + uassert(51156, + str::stream() << "Error occurred while executing the regular expression. Result code:" + << execResult, + execResult == -1 || execResult == (regexState->numCaptures + 1)); + return execResult; +} + +Value RegexMatchHandler::nextMatch(RegexExecutionState* regexState) const { + int execResult = execute(regexState); + + // No match. + if (execResult < 0) { + return Value(BSONNULL); } -private: - RegexMatchHandler(const RegexMatchHandler&) = delete; - - void _compile(const int pcreOptions) { - const char* compile_error; - int eoffset; - // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the - // index of the match). So we are using the C interface. First we compile all the regex - // options to generate pcre object, which will later be used to match against the input - // string. - _pcre = pcre_compile(_pattern.c_str(), pcreOptions, &compile_error, &eoffset, nullptr); - if (_pcre == nullptr) { - uasserted(51111, str::stream() << "Invalid Regex: " << compile_error); - } + // Use 'input' as StringData throughout the function to avoid copying the string on 'substr' + // calls. + StringData input = *(regexState->input); - // Calculate the number of capture groups present in '_pattern' and store in '_numCaptures'. - int pcre_retval = pcre_fullinfo(_pcre, NULL, PCRE_INFO_CAPTURECOUNT, &_numCaptures); - invariant(pcre_retval == 0); - - // The first two-thirds of the vector is used to pass back captured substrings' start and - // limit indexes. The remaining third of the vector is used as workspace by pcre_exec() - // while matching capturing subpatterns, and is not available for passing back information. - // pcre_compile will error if there are too many capture groups in the pattern. As long as - // this memory is allocated after compile, the amount of memory allocated will not be too - // high. - _capturesBuffer = std::vector<int>((1 + _numCaptures) * 3); - } - - void _validateInputAndExtractElements(const Value& inputExpr) { - uassert(51103, - str::stream() << "$regexFind expects an object of named arguments, but found type " - << inputExpr.getType(), - inputExpr.getType() == BSONType::Object); - Value textInput = inputExpr.getDocument().getField("input"); - Value regexPattern = inputExpr.getDocument().getField("regex"); - Value regexOptions = inputExpr.getDocument().getField("options"); - - uassert(51104, - "'input' field should be of type string", - textInput.nullish() || textInput.getType() == BSONType::String); - uassert(51105, - "'regex' field should be of type string or regex", - regexPattern.nullish() || regexPattern.getType() == BSONType::String || - regexPattern.getType() == BSONType::RegEx); - uassert(51106, - "'options' should be of type string", - regexOptions.nullish() || regexOptions.getType() == BSONType::String); - - // If either the text input or regex pattern is nullish, then we consider the operation as a - // whole nullish. - _nullish = textInput.nullish() || regexPattern.nullish(); - - if (textInput.getType() == BSONType::String) { - _input = textInput.getString(); - } + // The first and second entries of the 'capturesBuffer' will have the start and (end+1) indices + // of the matched string, as byte offsets. '(limit - startIndex)' would be the length of the + // captured string. + const int matchStartByteIndex = regexState->capturesBuffer[0]; + StringData matchedStr = + input.substr(matchStartByteIndex, regexState->capturesBuffer[1] - matchStartByteIndex); - // The 'regex' field can be a RegEx object and may have its own options... - if (regexPattern.getType() == BSONType::RegEx) { - StringData regexFlags = regexPattern.getRegexFlags(); - _pattern = regexPattern.getRegex(); - uassert(51107, - str::stream() - << "Found regex option(s) specified in both 'regex' and 'option' fields", - regexOptions.nullish() || regexFlags.empty()); - if (!regexFlags.empty()) { - _options = regexFlags.toString(); - } - } else if (regexPattern.getType() == BSONType::String) { - // ...or it can be a string field with options specified separately. - _pattern = regexPattern.getString(); - } - // If 'options' is non-null, we must extract and validate its contents even if - // 'regexPattern' is nullish. - if (!regexOptions.nullish()) { - _options = regexOptions.getString(); + // We iterate through the input string's contents preceding the match index, in order to convert + // the byte offset to a code point offset. + for (int byteIx = regexState->startBytePos; byteIx < matchStartByteIndex; + ++(regexState->startCodePointPos)) { + byteIx += getCodePointLength(input[byteIx]); + } + + // Set the start index for match to the new one. + regexState->startBytePos = matchStartByteIndex; + + std::vector<Value> captures; + captures.reserve(regexState->numCaptures); + + // The next '2 * numCaptures' entries (after the first two entries) of 'capturesBuffer' will + // hold the start index and limit pairs, for each of the capture groups. We skip the first two + // elements and start iteration from 3rd element so that we only construct the strings for + // capture groups. + for (int i = 0; i < regexState->numCaptures; ++i) { + const int start = regexState->capturesBuffer[2 * (i + 1)]; + const int limit = regexState->capturesBuffer[2 * (i + 1) + 1]; + captures.push_back(Value(input.substr(start, limit - start))); + } + + MutableDocument match; + match.addField("match", Value(matchedStr)); + match.addField("idx", Value(regexState->startCodePointPos)); + match.addField("captures", Value(captures)); + return match.freezeToValue(); +} + +void RegexMatchHandler::_compile(RegexExecutionState* executionState) const { + const auto pcreOptions = + regex_util::flags2PcreOptions(executionState->options, false).all_options(); + + if (!executionState->pattern) { + return; + } + const char* compile_error; + int eoffset; + + // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the index of + // the match). So we are using the C interface. First we compile all the regex options to + // generate pcre object, which will later be used to match against the input string. + executionState->pcrePtr = std::shared_ptr<pcre>( + pcre_compile( + executionState->pattern->c_str(), pcreOptions, &compile_error, &eoffset, nullptr), + pcre_free); + uassert(51111, str::stream() << "Invalid Regex: " << compile_error, executionState->pcrePtr); + + // Calculate the number of capture groups present in 'pattern' and store in 'numCaptures'. + const int pcre_retval = pcre_fullinfo( + executionState->pcrePtr.get(), NULL, PCRE_INFO_CAPTURECOUNT, &executionState->numCaptures); + invariant(pcre_retval == 0); + + // The first two-thirds of the vector is used to pass back captured substrings' start and + // (end+1) indexes. The remaining third of the vector is used as workspace by pcre_exec() while + // matching capturing subpatterns, and is not available for passing back information. + // pcre_compile will error if there are too many capture groups in the pattern. As long as this + // memory is allocated after compile, the amount of memory allocated will not be too high. + executionState->capturesBuffer.resize((1 + executionState->numCaptures) * 3); +} + +void RegexMatchHandler::_extractInputField(RegexExecutionState* executionState, + const Value& textInput) const { + uassert(51104, + "'input' field should be of type string", + textInput.nullish() || textInput.getType() == BSONType::String); + if (textInput.getType() == BSONType::String) { + executionState->input = textInput.getString(); + } +} + +void RegexMatchHandler::_extractRegexAndOptions(RegexExecutionState* executionState, + const Value& regexPattern, + const Value& regexOptions) const { + uassert(51105, + "'regex' field should be of type string or regex", + regexPattern.nullish() || regexPattern.getType() == BSONType::String || + regexPattern.getType() == BSONType::RegEx); + uassert(51106, + "'options' should be of type string", + regexOptions.nullish() || regexOptions.getType() == BSONType::String); + + // The 'regex' field can be a RegEx object and may have its own options... + if (regexPattern.getType() == BSONType::RegEx) { + StringData regexFlags = regexPattern.getRegexFlags(); + executionState->pattern = regexPattern.getRegex(); + uassert( + 51107, + str::stream() << "Found regex option(s) specified in both 'regex' and 'option' fields", + regexOptions.nullish() || regexFlags.empty()); + if (!regexFlags.empty()) { + executionState->options = regexFlags.toString(); } - uassert(51109, - "Regular expression cannot contain an embedded null byte", - _pattern.find('\0', 0) == string::npos); - uassert(51110, - "Regular expression options string cannot contain an embedded null byte", - _options.find('\0', 0) == string::npos); - } - - pcre* _pcre; - // Number of capture groups present in '_pattern'. - int _numCaptures; - // Holds the start and limit indices of match and captures for the current match. - std::vector<int> _capturesBuffer; - std::string _input; - std::string _pattern; - std::string _options; - bool _nullish; -}; + } else if (regexPattern.getType() == BSONType::String) { + // ...or it can be a string field with options specified separately. + executionState->pattern = regexPattern.getString(); + } -} // namespace + // If 'options' is non-null, we must extract and validate its contents even if 'regexPattern' is + // nullish. + if (!regexOptions.nullish()) { + executionState->options = regexOptions.getString(); + } + uassert(51109, + "Regular expression cannot contain an embedded null byte", + !executionState->pattern || executionState->pattern->find('\0', 0) == string::npos); + uassert(51110, + "Regular expression options string cannot contain an embedded null byte", + executionState->options.find('\0', 0) == string::npos); +} -Value ExpressionRegexFind::evaluate(const Document& root) const { +boost::intrusive_ptr<Expression> ExpressionRegexFind::optimize() { + _handler.optimize(vpOperand[0]); + return this; +} - RegexMatchHandler regex(vpOperand[0]->evaluate(root)); - if (regex.nullish()) { +Value ExpressionRegexFind::evaluate(const Document& root) const { + auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root)); + if (executionState.nullish()) { return Value(BSONNULL); } - int startByteIndex = 0, startCodePointIndex = 0; - return regex.nextMatch(&startByteIndex, &startCodePointIndex); + return _handler.nextMatch(&executionState); } REGISTER_EXPRESSION(regexFind, ExpressionRegexFind::parse); @@ -5883,21 +5916,24 @@ const char* ExpressionRegexFind::getOpName() const { return "$regexFind"; } -Value ExpressionRegexFindAll::evaluate(const Document& root) const { +boost::intrusive_ptr<Expression> ExpressionRegexFindAll::optimize() { + _handler.optimize(vpOperand[0]); + return this; +} +Value ExpressionRegexFindAll::evaluate(const Document& root) const { std::vector<Value> output; - RegexMatchHandler regex(vpOperand[0]->evaluate(root)); - if (regex.nullish()) { + auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root)); + if (executionState.nullish()) { return Value(output); } - int startByteIndex = 0, startCodePointIndex = 0; - StringData input = regex.getInput(); + StringData input = *(executionState.input); size_t totalDocSize = 0; // Using do...while loop because, when input is an empty string, we still want to see if there // is a match. do { - auto matchObj = regex.nextMatch(&startByteIndex, &startCodePointIndex); + auto matchObj = _handler.nextMatch(&executionState); if (matchObj.getType() == BSONType::jstNULL) { break; } @@ -5913,19 +5949,22 @@ Value ExpressionRegexFindAll::evaluate(const Document& root) const { // the character at startByteIndex matches the regex, we cannot return it since we are // already returing an empty string starting at this index. So we move on to the next // byte index. - startByteIndex += getCodePointLength(input[startByteIndex]); - ++startCodePointIndex; + executionState.startBytePos += getCodePointLength(input[executionState.startBytePos]); + ++executionState.startCodePointPos; continue; } - // We don't want any overlapping sub-strings. So we move 'startByteIndex' to point to the + + // We don't want any overlapping sub-strings. So we move 'startBytePos' to point to the // byte after 'matchStr'. We move the code point index also correspondingly. - startByteIndex += matchStr.size(); - for (size_t byteIx = 0; byteIx < matchStr.size(); ++startCodePointIndex) { + executionState.startBytePos += matchStr.size(); + for (size_t byteIx = 0; byteIx < matchStr.size(); ++executionState.startCodePointPos) { byteIx += getCodePointLength(matchStr[byteIx]); } - invariant(startByteIndex > 0 && startCodePointIndex > 0 && - startCodePointIndex <= startByteIndex); - } while (static_cast<size_t>(startByteIndex) < input.size()); + + invariant(executionState.startBytePos > 0); + invariant(executionState.startCodePointPos > 0); + invariant(executionState.startCodePointPos <= executionState.startBytePos); + } while (static_cast<size_t>(executionState.startBytePos) < input.size()); return Value(output); } @@ -5934,10 +5973,15 @@ const char* ExpressionRegexFindAll::getOpName() const { return "$regexFindAll"; } +boost::intrusive_ptr<Expression> ExpressionRegexMatch::optimize() { + _handler.optimize(vpOperand[0]); + return this; +} + Value ExpressionRegexMatch::evaluate(const Document& root) const { - RegexMatchHandler regex(vpOperand[0]->evaluate(root)); + auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root)); // Return output of execute only if regex is not nullish. - return regex.nullish() ? Value(false) : Value(regex.execute(0) > 0); + return executionState.nullish() ? Value(false) : Value(_handler.execute(&executionState) > 0); } REGISTER_EXPRESSION(regexMatch, ExpressionRegexMatch::parse); |