diff options
author | Irina Yatsenko <irina.yatsenko@mongodb.com> | 2022-09-23 15:58:23 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-09-23 17:00:51 +0000 |
commit | 02c78ac52fd67148b499b0cbf0f422a50bd92943 (patch) | |
tree | d74689c80944e87bf8c97bfc460a5d7d1cd0a327 | |
parent | d22d68c4c2fcdc76fcd0079a12e9f5e39b119381 (diff) | |
download | mongo-02c78ac52fd67148b499b0cbf0f422a50bd92943.tar.gz |
SERVER-68792 Refactor iterating over cell values into a cursor
-rw-r--r-- | src/mongo/db/exec/sbe/stages/column_scan.cpp | 104 | ||||
-rw-r--r-- | src/mongo/db/exec/sbe/values/columnar.h | 50 | ||||
-rw-r--r-- | src/mongo/db/storage/column_store.h | 153 |
3 files changed, 163 insertions, 144 deletions
diff --git a/src/mongo/db/exec/sbe/stages/column_scan.cpp b/src/mongo/db/exec/sbe/stages/column_scan.cpp index 640894aa33b..0585296ed23 100644 --- a/src/mongo/db/exec/sbe/stages/column_scan.cpp +++ b/src/mongo/db/exec/sbe/stages/column_scan.cpp @@ -354,106 +354,22 @@ void ColumnScanStage::readParentsIntoObj(StringData path, // The result of the filter predicate will be the same regardless of sparseness or sub objects, // therefore we don't look at the parents and don't consult the row store. -// (TODO SERVER-68792) Refactor the iteration over values into its own type. bool ColumnScanStage::checkFilter(CellView cell, size_t filterIndex, const PathValue& path) { auto splitCellView = SplitCellView::parse(cell); - auto translatedCell = translateCell(path, splitCellView); - if (!translatedCell.moreValues()) { - return false; - } + SplitCellView::CursorWithArrayDepth<value::ColumnStoreEncoder> cellCursor{ + splitCellView.firstValuePtr, splitCellView.arrInfo, &_encoder}; - if (translatedCell.arrInfo.empty()) { - // Have a single non-nested value -- evaluate the filter on it. - // (TODO SERVER-68792) Could we avoid copying by using a non-owning accessor? Same question - // for other locations in this function when the predicate is evaluated immediately after - // setting the slot. - auto [tag, val] = translatedCell.nextValue(); - _filterInputAccessors[filterIndex].reset(tag, val); - return _bytecode.runPredicate(_filterExprsCode[filterIndex].get()); - } else { - ArrInfoReader arrInfoReader{translatedCell.arrInfo}; - int depth = 0; - - // Avoid allocating memory for this stack except in the rare case of deeply nested - // documents. - std::stack<bool, absl::InlinedVector<bool, 64>> inArray; - while (arrInfoReader.moreExplicitComponents()) { - switch (arrInfoReader.takeNextChar()) { - case '{': { - // We consider as nested only the arrays that are elements of other arrays. When - // there is an array of objects and some of the fields of these objects are - // arrays, the latter aren't nested. - inArray.push(false); - break; - } - case '[': { - // A '[' can be followed by a number if there are objects in the array, that - // should be retrieved from other paths when reconstructing the record. We can - // ignore them as they don't contribute to the values. - (void)arrInfoReader.takeNumber(); - if (!inArray.empty() && inArray.top()) { - depth++; - } - inArray.push(true); - break; - } - case '+': { - // Indicates elements in arrays that are objects that don't have the path. These - // objects don't contribute to the cell's values, so we can ignore them. - (void)arrInfoReader.takeNumber(); - break; - } - case '|': { - auto repeats = arrInfoReader.takeNumber(); - for (size_t i = 0; i < repeats + 1; i++) { - auto [tag, val] = translatedCell.nextValue(); - if (depth == 0) { - _filterInputAccessors[filterIndex].reset(tag, val); - if (_bytecode.runPredicate(_filterExprsCode[filterIndex].get())) { - return true; - } - } - } - break; - } - case 'o': { - // Indicates the start of a nested object inside the cell. We don't need to - // track this info because the nested objects don't contribute to the values in - // the cell. - (void)arrInfoReader.takeNumber(); - break; - } - case ']': { - invariant(inArray.size() > 0 && inArray.top()); - inArray.pop(); - if (inArray.size() > 0 && inArray.top()) { - invariant(depth > 0); - depth--; - } - - // Closing an array implicitly closes all objects on the path between it and the - // previous array. - while (inArray.size() > 0 && !inArray.top()) { - inArray.pop(); - } - } - } - } - if (depth == 0) { - // For the remaining values the depth isn't going to change so we don't need to advance - // the value iterator if the values are too deep. - while (translatedCell.moreValues()) { - auto [tag, val] = translatedCell.nextValue(); - _filterInputAccessors[filterIndex].reset(tag, val); - if (_bytecode.runPredicate(_filterExprsCode[filterIndex].get())) { - return true; - } - } + while (cellCursor.hasNext()) { + auto [val, depth] = cellCursor.nextValue(); + if (depth > 0) + continue; + + _filterInputAccessors[filterIndex].reset(val->first, val->second); + if (_bytecode.runPredicate(_filterExprsCode[filterIndex].get())) { + return true; } } - - // None of the values matched the filter. return false; } diff --git a/src/mongo/db/exec/sbe/values/columnar.h b/src/mongo/db/exec/sbe/values/columnar.h index 01aef2acbce..ec0fb5f07c8 100644 --- a/src/mongo/db/exec/sbe/values/columnar.h +++ b/src/mongo/db/exec/sbe/values/columnar.h @@ -39,56 +39,6 @@ */ namespace mongo::sbe { -/* - * Array info reader based on a string representation of arrayInfo. Allows for reading/peeking of - * individual components. - */ -class ArrInfoReader { -public: - explicit ArrInfoReader(StringData arrInfoStr) : _arrInfo(arrInfoStr) {} - - char nextChar() const { - if (_offsetInArrInfo == _arrInfo.size()) { - // Reaching the end of the array info means an unlimited number of '|'s. - return '|'; - } - return _arrInfo[_offsetInArrInfo]; - } - - char takeNextChar() { - if (_offsetInArrInfo == _arrInfo.size()) { - // Reaching the end of the array info means an unlimited number of '|'s. - return '|'; - } - return _arrInfo[_offsetInArrInfo++]; - } - - size_t takeNumber() { - return ColumnStore::readArrInfoNumber(_arrInfo, &_offsetInArrInfo); - } - - bool empty() const { - return _arrInfo.empty(); - } - - /* - * Returns whether more explicit components are yet to be consumed. Since array info logically - * ends with an infinite stream of |, this function indicates whether there are more components - * which are physically present to be read, not including the infinite sequence of |. - */ - bool moreExplicitComponents() const { - return _offsetInArrInfo < _arrInfo.size(); - } - - StringData rawArrInfo() const { - return _arrInfo; - } - -private: - StringData _arrInfo; - size_t _offsetInArrInfo = 0; -}; - /** * Represents a cell in a columnar index with ability to retrieve values in an SBE native format. */ diff --git a/src/mongo/db/storage/column_store.h b/src/mongo/db/storage/column_store.h index 995b7e15755..9db8ca7d1bb 100644 --- a/src/mongo/db/storage/column_store.h +++ b/src/mongo/db/storage/column_store.h @@ -30,6 +30,7 @@ #pragma once #include <boost/optional/optional.hpp> +#include <stack> #include "mongo/base/string_data.h" #include "mongo/bson/bsonelement.h" @@ -382,6 +383,56 @@ protected: std::shared_ptr<Ident> _ident; }; +/* + * Array info reader based on a string representation of arrayInfo. Allows for reading/peeking of + * individual components. + */ +class ArrInfoReader { +public: + explicit ArrInfoReader(StringData arrInfoStr) : _arrInfo(arrInfoStr) {} + + char nextChar() const { + if (_offsetInArrInfo == _arrInfo.size()) { + // Reaching the end of the array info means an unlimited number of '|'s. + return '|'; + } + return _arrInfo[_offsetInArrInfo]; + } + + char takeNextChar() { + if (_offsetInArrInfo == _arrInfo.size()) { + // Reaching the end of the array info means an unlimited number of '|'s. + return '|'; + } + return _arrInfo[_offsetInArrInfo++]; + } + + size_t takeNumber() { + return ColumnStore::readArrInfoNumber(_arrInfo, &_offsetInArrInfo); + } + + bool empty() const { + return _arrInfo.empty(); + } + + /* + * Returns whether more explicit components are yet to be consumed. Since array info logically + * ends with an infinite stream of |, this function indicates whether there are more components + * which are physically present to be read, not including the infinite sequence of |. + */ + bool moreExplicitComponents() const { + return _offsetInArrInfo < _arrInfo.size(); + } + + StringData rawArrInfo() const { + return _arrInfo; + } + +private: + StringData _arrInfo; + size_t _offsetInArrInfo = 0; +}; + struct SplitCellView { StringData arrInfo; // rawData() is 1-past-end of range starting with firstValuePtr. const char* firstValuePtr = nullptr; @@ -423,6 +474,108 @@ struct SplitCellView { return Cursor<ValueEncoder>{firstValuePtr, arrInfo.rawData(), valEncoder}; } + template <class ValueEncoder> + struct CursorWithArrayDepth { + CursorWithArrayDepth(const char* elemPtr, + const StringData& arrayInfo, + ValueEncoder* encoder) + : elemPtr(elemPtr), + end(arrayInfo.rawData()), + encoder(encoder), + arrInfoReader(arrayInfo) {} + + bool hasNext() const { + return elemPtr < end; + } + + using Out = typename std::remove_reference_t<ValueEncoder>::Out; + /** + * Returns the next value with its array-nestedness level. + */ + std::pair<Out /*decoded value*/, int /*depth*/> nextValue() { + invariant(elemPtr < end, "The client should have checked 'hasNext()'"); + + // The expected most common case: implicit tail of values at the same depth. + if (!arrInfoReader.moreExplicitComponents()) { + return {decodeAndAdvance(elemPtr, *encoder), depth}; + } + + // The next expected most common case: a range of values at the same depth. + if (repeats > 0) { + repeats--; + return {decodeAndAdvance(elemPtr, *encoder), depth}; + } + + // An end of a range means we have to check for structural changes and update depth. + while (arrInfoReader.moreExplicitComponents()) { + switch (arrInfoReader.takeNextChar()) { + case '[': { + // A '[' can be followed by a number if there are objects in the array, + // that should be retrieved from other paths when reconstructing the + // record. We can ignore them as they don't contribute to the values. + (void)arrInfoReader.takeNumber(); + if (!inArray.empty() && inArray.top()) { + depth++; + } + inArray.push(true); + break; + } + case '|': { + repeats = arrInfoReader.takeNumber(); + return {decodeAndAdvance(elemPtr, *encoder), depth}; + } + case '{': { + // We consider as nested only the arrays that are elements of other + // arrays. When there is an array of objects and some of the fields of + // these objects are arrays, the latter aren't nested. + inArray.push(false); + break; + } + case ']': { + invariant(inArray.size() > 0 && inArray.top()); + inArray.pop(); + if (inArray.size() > 0 && inArray.top()) { + invariant(depth > 0); + depth--; + } + + // Closing an array implicitly closes all objects on the path between it + // and the previous array. + while (inArray.size() > 0 && !inArray.top()) { + inArray.pop(); + } + break; + } + case '+': { + // Indicates elements in arrays that are objects that don't have the + // path. These objects don't contribute to the cell's values, so we can + // ignore them. + (void)arrInfoReader.takeNumber(); + break; + } + case 'o': { + // Indicates the start of a nested object inside the cell. We don't need + // to track this info because the nested objects don't contribute to the + // values in the cell. + (void)arrInfoReader.takeNumber(); + break; + } + } + } + + // Start consuming the implicit tail range. + return {decodeAndAdvance(elemPtr, *encoder), depth}; + } + + const char* elemPtr; + const char* end; + ValueEncoder* encoder; + ArrInfoReader arrInfoReader; + int depth = 0; + std::stack<bool, absl::InlinedVector<bool, 64>> inArray; + size_t repeats = 0; + }; + static SplitCellView parse(CellView cell) { using Bytes = ColumnStore::Bytes; using TinySize = ColumnStore::Bytes::TinySize; |