diff options
Diffstat (limited to 'src/mongo/db/cst/bson_lexer.cpp')
-rw-r--r-- | src/mongo/db/cst/bson_lexer.cpp | 142 |
1 files changed, 92 insertions, 50 deletions
diff --git a/src/mongo/db/cst/bson_lexer.cpp b/src/mongo/db/cst/bson_lexer.cpp index cb4c8b10109..0f15a7eccef 100644 --- a/src/mongo/db/cst/bson_lexer.cpp +++ b/src/mongo/db/cst/bson_lexer.cpp @@ -37,66 +37,108 @@ namespace mongo { namespace { -// Structure to annotate reserved tokens, used to hint the lexer into behavior once the token is -// seen. There are no semantic values associated with these tokens. -struct KeywordToken { - boost::optional<PipelineParserGen::token_type> type; - bool traverseChild; - bool orderedArguments; -}; - -// Mapping of reserved keywords to BSON token. Any key which is not included in this map is treated -// as a terminal. -const StringMap<KeywordToken> reservedKeyLookup = { - {"$_internalInhibitOptimization", - { - PipelineParserGen::token::STAGE_INHIBIT_OPTIMIZATION, - false /* traverseChild */, - false /* orderedArguments */ - }}, +// Mapping of reserved keywords to BSON token. Any key which is not included in this map is assumed +// to be a user field name and is treated as a terminal by the parser. +const StringMap<PipelineParserGen::token_type> reservedKeyLookup = { + {"$_internalInhibitOptimization", PipelineParserGen::token::STAGE_INHIBIT_OPTIMIZATION}, + {"$unionWith", PipelineParserGen::token::STAGE_UNION_WITH}, + {"coll", PipelineParserGen::token::COLL_ARG}, + {"pipeline", PipelineParserGen::token::PIPELINE_ARG}, }; +bool isCompound(PipelineParserGen::symbol_type token) { + return token.type_get() == static_cast<int>(PipelineParserGen::token::START_OBJECT) || + token.type_get() == static_cast<int>(PipelineParserGen::token::START_ARRAY); +} } // namespace -void BSONLexer::tokenize(BSONElement elem, bool includeFieldName) { - // The rules for tokenizing element field names fall into two buckets: - // 1. It matches a reserved keyword or operator. In this case, the traversal rules and - // sensitivity to ordered nested arguments is dictated by the annotation in the lookup - // table. - // 2. It is not a reserved keyword and thus is treated as a STRING literal token. The - // traversal rules for such elements is always to traverse but ignore argument order if the - // value is an object. - KeywordToken fieldNameToken = [&] { - if (includeFieldName) { - if (auto it = reservedKeyLookup.find(elem.fieldNameStringData()); - it != reservedKeyLookup.end()) { - _tokens.emplace_back(*it->second.type, getNextLoc()); - return it->second; - } else { - _tokens.emplace_back( - PipelineParserGen::make_STRING(elem.fieldName(), getNextLoc())); - return KeywordToken{PipelineParserGen::token::STRING, true, false}; +void BSONLexer::sortObjTokens() { + // A TokenElement is similar to a BSONElement, with the payload being a vector of Bison symbols + // if the type is compound (object or array). + using TokenElement = + std::pair<PipelineParserGen::symbol_type, std::vector<PipelineParserGen::symbol_type>>; + struct TokenElementCompare { + bool operator()(const TokenElement& elem1, const TokenElement& elem2) const { + return elem1.first.type_get() < elem2.first.type_get(); + } + }; + + auto currentPosition = _position; + if (_tokens[currentPosition].type_get() != + static_cast<int>(PipelineParserGen::token::START_OBJECT)) { + return; + } + + std::list<TokenElement> sortedTokenPairs; + // Increment to get to the first token after the START_OBJECT. We will sort tokens until the + // matching END_OBJECT is found. + currentPosition++; + while (_tokens[currentPosition].type_get() != + static_cast<int>(PipelineParserGen::token::END_OBJECT)) { + invariant(size_t(currentPosition) < _tokens.size()); + + auto keyToken = _tokens[currentPosition++]; + + std::vector<PipelineParserGen::symbol_type> rhsTokens; + rhsTokens.push_back(_tokens[currentPosition]); + if (isCompound(_tokens[currentPosition])) { + auto braceCount = 1; + currentPosition++; + // Only sort the top level tokens. If we encounter a compound type, then jump to its + // matching bracket or brace. + while (braceCount > 0) { + if (isCompound(_tokens[currentPosition])) + braceCount++; + if (_tokens[currentPosition].type_get() == + static_cast<int>(PipelineParserGen::token::END_OBJECT) || + _tokens[currentPosition].type_get() == + static_cast<int>(PipelineParserGen::token::END_ARRAY)) + braceCount--; + + rhsTokens.push_back(_tokens[currentPosition++]); } + } else { + // Scalar, already added above. + currentPosition++; + } + sortedTokenPairs.push_back(std::make_pair(keyToken, rhsTokens)); + } + sortedTokenPairs.sort(TokenElementCompare()); + + // _position is at the initial START_OBJECT, and currentPosition is at its matching + // END_OBJECT. We need to flatten the sorted list of KV pairs to get the correct order of + // tokens. + auto replacePosition = _position + 1; + for (auto&& [key, rhsTokens] : sortedTokenPairs) { + _tokens[replacePosition].clear(); + _tokens[replacePosition++].move(key); + for (auto&& token : rhsTokens) { + _tokens[replacePosition].clear(); + _tokens[replacePosition++].move(token); + } + } +} + +void BSONLexer::tokenize(BSONElement elem, bool includeFieldName) { + // Skipped when we are tokenizing arrays. + if (includeFieldName) { + if (auto it = reservedKeyLookup.find(elem.fieldNameStringData()); + it != reservedKeyLookup.end()) { + // Place the token expected by the parser if this is a reserved keyword. + _tokens.emplace_back(it->second, getNextLoc()); + } else { + // If we don't care about the keyword, the fieldname is treated as a normal string. + _tokens.emplace_back(PipelineParserGen::make_STRING(elem.fieldName(), getNextLoc())); } - return KeywordToken{boost::none, true, false}; - }(); + } switch (elem.type()) { case BSONType::Object: - if (fieldNameToken.orderedArguments) { - _tokens.emplace_back(PipelineParserGen::token::START_ORDERED_OBJECT, getNextLoc()); - BSONObjIteratorSorted sortedIt(elem.embeddedObject()); - while (sortedIt.more()) { - tokenize(sortedIt.next(), true); - } - _tokens.emplace_back(PipelineParserGen::token::END_ORDERED_OBJECT, getNextLoc()); - } else { - _tokens.emplace_back(PipelineParserGen::token::START_OBJECT, getNextLoc()); - for (auto&& nestedElem : elem.embeddedObject()) { - tokenize(nestedElem, true); - } - _tokens.emplace_back(PipelineParserGen::token::END_OBJECT, getNextLoc()); + _tokens.emplace_back(PipelineParserGen::token::START_OBJECT, getNextLoc()); + for (auto&& nestedElem : elem.embeddedObject()) { + tokenize(nestedElem, true); } + _tokens.emplace_back(PipelineParserGen::token::END_OBJECT, getNextLoc()); break; case BSONType::Array: _tokens.emplace_back(PipelineParserGen::token::START_ARRAY, getNextLoc()); |