diff options
author | Ian Boros <puppyofkosh@gmail.com> | 2019-04-23 12:53:10 -0400 |
---|---|---|
committer | Ian Boros <puppyofkosh@gmail.com> | 2019-04-23 12:56:00 -0400 |
commit | 03a6b3f910e051624dd6b3065b5ee9ea3b6c6fb5 (patch) | |
tree | 7b847a779c9b8f2ae37c4b43bb9018a7d048395c /src/mongo/db/pipeline/expression.cpp | |
parent | c3a2176d5124ef3dc160db2b9f85c698e1ca2664 (diff) | |
download | mongo-03a6b3f910e051624dd6b3065b5ee9ea3b6c6fb5.tar.gz |
Revert "SERVER-40083 Don't recompile each time $regex is evaluated when regex argument is a constant"
This reverts commit a60f6a53734fa3a022e9ba39bbdab95608ba9108.
Diffstat (limited to 'src/mongo/db/pipeline/expression.cpp')
-rw-r--r-- | src/mongo/db/pipeline/expression.cpp | 444 |
1 files changed, 200 insertions, 244 deletions
diff --git a/src/mongo/db/pipeline/expression.cpp b/src/mongo/db/pipeline/expression.cpp index 8b6b00739e4..364f8804cd9 100644 --- a/src/mongo/db/pipeline/expression.cpp +++ b/src/mongo/db/pipeline/expression.cpp @@ -1,4 +1,3 @@ - /** * Copyright (C) 2018-present MongoDB, Inc. * @@ -36,6 +35,7 @@ #include <algorithm> #include <boost/algorithm/string.hpp> #include <cstdio> +#include <pcre.h> #include <pcrecpp.h> #include <vector> @@ -5669,246 +5669,213 @@ Value ExpressionConvert::performConversion(BSONType targetType, Value inputValue BSONType inputType = inputValue.getType(); return table.findConversionFunc(inputType, targetType)(getExpressionContext(), inputValue); } -namespace { - -boost::optional<Value> extractValueFromConstantExpression( - const std::string& fieldName, - const std::vector<std::pair<std::string, boost::intrusive_ptr<Expression>>>& childExpressions) { - // Find the element with the fieldName. - auto expressionPairItr = std::find_if( - childExpressions.begin(), childExpressions.end(), [&](const auto& childExpression) { - return childExpression.first == fieldName; - }); - // If the field doesn't exists it is still eligible for optimization. - if (expressionPairItr == childExpressions.end()) { - return Value(BSONNULL); - } +namespace { - // If the field exists and not null/constant, we cannot optimize. - if (!ExpressionConstant::isNullOrConstant(expressionPairItr->second)) { - return boost::none; +class RegexMatchHandler { +public: + RegexMatchHandler(const Value& inputExpr) : _pcre(nullptr), _nullish(false) { + _validateInputAndExtractElements(inputExpr); + _compile(regex_util::flags2PcreOptions(_options, false).all_options()); } - auto* expression = expressionPairItr->second.get(); - return dynamic_cast<ExpressionConstant*>(expression)->getValue(); -} - -} // namespace -void RegexMatchHandler::optimize(boost::intrusive_ptr<Expression> expression) { - auto optimizedExpr = expression->optimize(); - - // If 'input', 'regex' and 'options' are null/constant then 'optimize()' will convert the - // object to an expression of type 'ExpressionConstant'. - if (auto* exprObj = dynamic_cast<ExpressionConstant*>(optimizedExpr.get())) { - _initialExecStateForConstantRegex = buildInitialState(exprObj->getValue()); - } else if (auto* exprObj = dynamic_cast<ExpressionObject*>(optimizedExpr.get())) { - // Extract the children and check for constant 'regex' and 'options'. - auto& children = exprObj->getChildExpressions(); - auto regex = extractValueFromConstantExpression("regex", children); - auto options = extractValueFromConstantExpression("options", children); - - // If both 'regex' and 'options' are null/constant, we can pre-compile the execution state. - if (regex && options) { - RegexExecutionState executionState; - _extractRegexAndOptions(&executionState, *regex, *options); - _compile(&executionState); - _initialExecStateForConstantRegex = std::move(executionState); + ~RegexMatchHandler() { + if (_pcre != nullptr) { + pcre_free(_pcre); } } -} - -RegexMatchHandler::RegexExecutionState RegexMatchHandler::buildInitialState( - const Value& inputExpr) const { - uassert(51103, - str::stream() << "expression expects an object of named arguments, but found type " - << inputExpr.getType(), - inputExpr.getType() == BSONType::Object); - Value textInput = inputExpr.getDocument().getField("input"); - Value regexPattern = inputExpr.getDocument().getField("regex"); - Value regexOptions = inputExpr.getDocument().getField("options"); - - auto executionState = _initialExecStateForConstantRegex.value_or(RegexExecutionState()); - - // The 'input' parameter can be a variable and needs to be extracted from the expression - // document even when '_preExecutionState' is present. - _extractInputField(&executionState, textInput); - - // If we have a prebuilt execution state, then the 'regex' and 'options' fields are constant - // values, and we do not need to re-compile them. - if (!hasConstantRegex()) { - _extractRegexAndOptions(&executionState, regexPattern, regexOptions); - _compile(&executionState); - } - - return executionState; -} - -int RegexMatchHandler::execute(RegexExecutionState* regexState) const { - invariant(regexState); - invariant(!regexState->nullish()); - invariant(regexState->pcre); - int execResult = pcre_exec(regexState->pcre.get(), - 0, - regexState->input->c_str(), - regexState->input->size(), - regexState->startBytePos, - 0, // No need to overwrite the options set during pcre_compile. - &(regexState->capturesBuffer.front()), - regexState->capturesBuffer.size()); - // The 'execResult' will be (numCaptures + 1) if there is a match, -1 if there is no match, - // negative (other than -1) if there is an error during execution, and zero if capturesBuffer's - // capacity is not sufficient to hold all the results. The latter scenario should never occur. - uassert(51156, + int execute(int startBytePos) { + int execResult = pcre_exec(_pcre, + 0, + _input.c_str(), + _input.size(), + startBytePos, + 0, // No need to overwrite the options set during pcre_compile. + &_capturesBuffer.front(), + _capturesBuffer.size()); + // The 'execResult' will be (_numCaptures + 1) if there is a match, -1 if there is no + // match, negative if there is an error during execution, and zero if _capturesBuffer's + // capacity is not sufficient to hold all the results. The latter scenario should never + // occur. + uassert( + 51156, str::stream() << "Error occurred while executing the regular expression. Result code:" << execResult, - execResult == -1 || execResult == (regexState->numCaptures + 1)); - return execResult; -} + execResult == -1 || execResult == (_numCaptures + 1)); + return execResult; + } -Value RegexMatchHandler::nextMatch(RegexExecutionState* regexState) const { - int execResult = execute(regexState); + /** + * The function will match '_input' string based on the regex pattern present in '_pcre'. If + * there is a match, the function will return a 'Value' object encapsulating the matched string, + * the code point index of the matched string and a vector representing all the captured + * substrings. The function will also update the parameters 'startBytePos' and + * 'startCodePointPos' to the corresponding new indices. If there is no match, the function will + * return null 'Value' object. + */ + Value nextMatch(int* startBytePos, int* startCodePointPos) { + invariant(startBytePos != nullptr && startCodePointPos != nullptr); + + // Use input as StringData throughout the function to avoid copying the string on 'substr' + // calls. + StringData input = _input; + int execResult = execute(*startBytePos); + // No match. + if (execResult < 0) { + return Value(BSONNULL); + } - // No match. - if (execResult < 0) { - return Value(BSONNULL); + // The first and second entries of the '_capturesBuffer' will have the start and limit + // indices of the matched string, as byte offsets. '(limit - startIndex)' would be the + // length of the captured string. + const int matchStartByteIndex = _capturesBuffer[0]; + StringData matchedStr = + input.substr(matchStartByteIndex, _capturesBuffer[1] - matchStartByteIndex); + // We iterate through the input string's contents preceding the match index, in order to + // convert the byte offset to a code point offset. + for (int byteIx = *startBytePos; byteIx < matchStartByteIndex; ++(*startCodePointPos)) { + byteIx += getCodePointLength(input[byteIx]); + } + // Set the start index for match to the new one. + *startBytePos = matchStartByteIndex; + + std::vector<Value> captures; + captures.reserve(_numCaptures); + // The next '2 * numCaptures' entries (after the first two entries) of '_capturesBuffer' + // will hold the start index and limit pairs, for each of the capture groups. We skip the + // first two elements and start iteration from 3rd element so that we only construct the + // strings for capture groups. + for (int i = 0; i < _numCaptures; ++i) { + const int start = _capturesBuffer[2 * (i + 1)]; + const int limit = _capturesBuffer[2 * (i + 1) + 1]; + captures.push_back(Value(input.substr(start, limit - start))); + } + + MutableDocument match; + match.addField("match", Value(matchedStr)); + match.addField("idx", Value(*startCodePointPos)); + match.addField("captures", Value(captures)); + return match.freezeToValue(); } - // Use 'input' as StringData throughout the function to avoid copying the string on 'substr' - // calls. - StringData input = *(regexState->input); + int numCaptures() { + return _numCaptures; + } - // The first and second entries of the 'capturesBuffer' will have the start and (end+1) indices - // of the matched string, as byte offsets. '(limit - startIndex)' would be the length of the - // captured string. - const int matchStartByteIndex = regexState->capturesBuffer[0]; - StringData matchedStr = - input.substr(matchStartByteIndex, regexState->capturesBuffer[1] - matchStartByteIndex); + bool nullish() { + return _nullish; + } - // We iterate through the input string's contents preceding the match index, in order to convert - // the byte offset to a code point offset. - for (int byteIx = regexState->startBytePos; byteIx < matchStartByteIndex; - ++(regexState->startCodePointPos)) { - byteIx += getCodePointLength(input[byteIx]); + StringData getInput() { + return _input; } - // Set the start index for match to the new one. - regexState->startBytePos = matchStartByteIndex; - - std::vector<Value> captures; - captures.reserve(regexState->numCaptures); - - // The next '2 * numCaptures' entries (after the first two entries) of 'capturesBuffer' will - // hold the start index and limit pairs, for each of the capture groups. We skip the first two - // elements and start iteration from 3rd element so that we only construct the strings for - // capture groups. - for (int i = 0; i < regexState->numCaptures; ++i) { - const int start = regexState->capturesBuffer[2 * (i + 1)]; - const int limit = regexState->capturesBuffer[2 * (i + 1) + 1]; - captures.push_back(Value(input.substr(start, limit - start))); - } - - MutableDocument match; - match.addField("match", Value(matchedStr)); - match.addField("idx", Value(regexState->startCodePointPos)); - match.addField("captures", Value(captures)); - return match.freezeToValue(); -} - -void RegexMatchHandler::_compile(RegexExecutionState* executionState) const { - const auto pcreOptions = - regex_util::flags2PcreOptions(executionState->options, false).all_options(); - - if (!executionState->pattern) { - return; - } - const char* compile_error; - int eoffset; - - // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the index of - // the match). So we are using the C interface. First we compile all the regex options to - // generate pcre object, which will later be used to match against the input string. - executionState->pcre = std::shared_ptr<pcre>( - pcre_compile( - executionState->pattern->c_str(), pcreOptions, &compile_error, &eoffset, nullptr), - pcre_free); - uassert(51111, str::stream() << "Invalid Regex: " << compile_error, executionState->pcre); - - // Calculate the number of capture groups present in 'pattern' and store in 'numCaptures'. - const int pcre_retval = pcre_fullinfo( - executionState->pcre.get(), NULL, PCRE_INFO_CAPTURECOUNT, &executionState->numCaptures); - invariant(pcre_retval == 0); - - // The first two-thirds of the vector is used to pass back captured substrings' start and - // (end+1) indexes. The remaining third of the vector is used as workspace by pcre_exec() while - // matching capturing subpatterns, and is not available for passing back information. - // pcre_compile will error if there are too many capture groups in the pattern. As long as this - // memory is allocated after compile, the amount of memory allocated will not be too high. - executionState->capturesBuffer.resize((1 + executionState->numCaptures) * 3); -} - -void RegexMatchHandler::_extractInputField(RegexExecutionState* executionState, - const Value& textInput) const { - uassert(51104, - "'input' field should be of type string", - textInput.nullish() || textInput.getType() == BSONType::String); - if (textInput.getType() == BSONType::String) { - executionState->input = textInput.getString(); - } -} - -void RegexMatchHandler::_extractRegexAndOptions(RegexExecutionState* executionState, - const Value& regexPattern, - const Value& regexOptions) const { - uassert(51105, - "'regex' field should be of type string or regex", - regexPattern.nullish() || regexPattern.getType() == BSONType::String || - regexPattern.getType() == BSONType::RegEx); - uassert(51106, - "'options' should be of type string", - regexOptions.nullish() || regexOptions.getType() == BSONType::String); - - // The 'regex' field can be a RegEx object and may have its own options... - if (regexPattern.getType() == BSONType::RegEx) { - StringData regexFlags = regexPattern.getRegexFlags(); - executionState->pattern = regexPattern.getRegex(); - uassert( - 51107, - str::stream() << "Found regex option(s) specified in both 'regex' and 'option' fields", - regexOptions.nullish() || regexFlags.empty()); - if (!regexFlags.empty()) { - executionState->options = regexFlags.toString(); +private: + RegexMatchHandler(const RegexMatchHandler&) = delete; + + void _compile(const int pcreOptions) { + const char* compile_error; + int eoffset; + // The C++ interface pcreccp.h doesn't have a way to capture the matched string (or the + // index of the match). So we are using the C interface. First we compile all the regex + // options to generate pcre object, which will later be used to match against the input + // string. + _pcre = pcre_compile(_pattern.c_str(), pcreOptions, &compile_error, &eoffset, nullptr); + if (_pcre == nullptr) { + uasserted(51111, str::stream() << "Invalid Regex: " << compile_error); } - } else if (regexPattern.getType() == BSONType::String) { - // ...or it can be a string field with options specified separately. - executionState->pattern = regexPattern.getString(); - } - // If 'options' is non-null, we must extract and validate its contents even if 'regexPattern' is - // nullish. - if (!regexOptions.nullish()) { - executionState->options = regexOptions.getString(); - } - uassert(51109, - "Regular expression cannot contain an embedded null byte", - !executionState->pattern || executionState->pattern->find('\0', 0) == string::npos); - uassert(51110, - "Regular expression options string cannot contain an embedded null byte", - executionState->options.find('\0', 0) == string::npos); -} + // Calculate the number of capture groups present in '_pattern' and store in '_numCaptures'. + int pcre_retval = pcre_fullinfo(_pcre, NULL, PCRE_INFO_CAPTURECOUNT, &_numCaptures); + invariant(pcre_retval == 0); + + // The first two-thirds of the vector is used to pass back captured substrings' start and + // limit indexes. The remaining third of the vector is used as workspace by pcre_exec() + // while matching capturing subpatterns, and is not available for passing back information. + // pcre_compile will error if there are too many capture groups in the pattern. As long as + // this memory is allocated after compile, the amount of memory allocated will not be too + // high. + _capturesBuffer = std::vector<int>((1 + _numCaptures) * 3); + } + + void _validateInputAndExtractElements(const Value& inputExpr) { + uassert(51103, + str::stream() << "$regexFind expects an object of named arguments, but found type " + << inputExpr.getType(), + inputExpr.getType() == BSONType::Object); + Value textInput = inputExpr.getDocument().getField("input"); + Value regexPattern = inputExpr.getDocument().getField("regex"); + Value regexOptions = inputExpr.getDocument().getField("options"); + + uassert(51104, + "'input' field should be of type string", + textInput.nullish() || textInput.getType() == BSONType::String); + uassert(51105, + "'regex' field should be of type string or regex", + regexPattern.nullish() || regexPattern.getType() == BSONType::String || + regexPattern.getType() == BSONType::RegEx); + uassert(51106, + "'options' should be of type string", + regexOptions.nullish() || regexOptions.getType() == BSONType::String); + + // If either the text input or regex pattern is nullish, then we consider the operation as a + // whole nullish. + _nullish = textInput.nullish() || regexPattern.nullish(); + + if (textInput.getType() == BSONType::String) { + _input = textInput.getString(); + } -boost::intrusive_ptr<Expression> ExpressionRegexFind::optimize() { - _handler.optimize(vpOperand[0]); - return this; -} + // The 'regex' field can be a RegEx object and may have its own options... + if (regexPattern.getType() == BSONType::RegEx) { + StringData regexFlags = regexPattern.getRegexFlags(); + _pattern = regexPattern.getRegex(); + uassert(51107, + str::stream() + << "Found regex option(s) specified in both 'regex' and 'option' fields", + regexOptions.nullish() || regexFlags.empty()); + if (!regexFlags.empty()) { + _options = regexFlags.toString(); + } + } else if (regexPattern.getType() == BSONType::String) { + // ...or it can be a string field with options specified separately. + _pattern = regexPattern.getString(); + } + // If 'options' is non-null, we must extract and validate its contents even if + // 'regexPattern' is nullish. + if (!regexOptions.nullish()) { + _options = regexOptions.getString(); + } + uassert(51109, + "Regular expression cannot contain an embedded null byte", + _pattern.find('\0', 0) == string::npos); + uassert(51110, + "Regular expression options string cannot contain an embedded null byte", + _options.find('\0', 0) == string::npos); + } + + pcre* _pcre; + // Number of capture groups present in '_pattern'. + int _numCaptures; + // Holds the start and limit indices of match and captures for the current match. + std::vector<int> _capturesBuffer; + std::string _input; + std::string _pattern; + std::string _options; + bool _nullish; +}; + +} // namespace Value ExpressionRegexFind::evaluate(const Document& root) const { - auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root)); - if (executionState.nullish()) { + + RegexMatchHandler regex(vpOperand[0]->evaluate(root)); + if (regex.nullish()) { return Value(BSONNULL); } - return _handler.nextMatch(&executionState); + int startByteIndex = 0, startCodePointIndex = 0; + return regex.nextMatch(&startByteIndex, &startCodePointIndex); } REGISTER_EXPRESSION(regexFind, ExpressionRegexFind::parse); @@ -5916,24 +5883,21 @@ const char* ExpressionRegexFind::getOpName() const { return "$regexFind"; } -boost::intrusive_ptr<Expression> ExpressionRegexFindAll::optimize() { - _handler.optimize(vpOperand[0]); - return this; -} - Value ExpressionRegexFindAll::evaluate(const Document& root) const { + std::vector<Value> output; - auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root)); - if (executionState.nullish()) { + RegexMatchHandler regex(vpOperand[0]->evaluate(root)); + if (regex.nullish()) { return Value(output); } - StringData input = *(executionState.input); + int startByteIndex = 0, startCodePointIndex = 0; + StringData input = regex.getInput(); size_t totalDocSize = 0; // Using do...while loop because, when input is an empty string, we still want to see if there // is a match. do { - auto matchObj = _handler.nextMatch(&executionState); + auto matchObj = regex.nextMatch(&startByteIndex, &startCodePointIndex); if (matchObj.getType() == BSONType::jstNULL) { break; } @@ -5949,22 +5913,19 @@ Value ExpressionRegexFindAll::evaluate(const Document& root) const { // the character at startByteIndex matches the regex, we cannot return it since we are // already returing an empty string starting at this index. So we move on to the next // byte index. - executionState.startBytePos += getCodePointLength(input[executionState.startBytePos]); - ++executionState.startCodePointPos; + startByteIndex += getCodePointLength(input[startByteIndex]); + ++startCodePointIndex; continue; } - - // We don't want any overlapping sub-strings. So we move 'startBytePos' to point to the + // We don't want any overlapping sub-strings. So we move 'startByteIndex' to point to the // byte after 'matchStr'. We move the code point index also correspondingly. - executionState.startBytePos += matchStr.size(); - for (size_t byteIx = 0; byteIx < matchStr.size(); ++executionState.startCodePointPos) { + startByteIndex += matchStr.size(); + for (size_t byteIx = 0; byteIx < matchStr.size(); ++startCodePointIndex) { byteIx += getCodePointLength(matchStr[byteIx]); } - - invariant(executionState.startBytePos > 0); - invariant(executionState.startCodePointPos > 0); - invariant(executionState.startCodePointPos <= executionState.startBytePos); - } while (static_cast<size_t>(executionState.startBytePos) < input.size()); + invariant(startByteIndex > 0 && startCodePointIndex > 0 && + startCodePointIndex <= startByteIndex); + } while (static_cast<size_t>(startByteIndex) < input.size()); return Value(output); } @@ -5973,15 +5934,10 @@ const char* ExpressionRegexFindAll::getOpName() const { return "$regexFindAll"; } -boost::intrusive_ptr<Expression> ExpressionRegexMatch::optimize() { - _handler.optimize(vpOperand[0]); - return this; -} - Value ExpressionRegexMatch::evaluate(const Document& root) const { - auto executionState = _handler.buildInitialState(vpOperand[0]->evaluate(root)); + RegexMatchHandler regex(vpOperand[0]->evaluate(root)); // Return output of execute only if regex is not nullish. - return executionState.nullish() ? Value(false) : Value(_handler.execute(&executionState) > 0); + return regex.nullish() ? Value(false) : Value(regex.execute(0) > 0); } REGISTER_EXPRESSION(regexMatch, ExpressionRegexMatch::parse); |