SERVER-68792 Refactor iterating over cell values into a cursor

author: Irina Yatsenko <irina.yatsenko@mongodb.com> 2022-09-23 15:58:23 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-09-23 17:00:51 +0000
commit: 02c78ac52fd67148b499b0cbf0f422a50bd92943 (patch)
tree: d74689c80944e87bf8c97bfc460a5d7d1cd0a327
parent: d22d68c4c2fcdc76fcd0079a12e9f5e39b119381 (diff)
download: mongo-02c78ac52fd67148b499b0cbf0f422a50bd92943.tar.gz
3 files changed, 163 insertions, 144 deletions
diff --git a/src/mongo/db/exec/sbe/stages/column_scan.cpp b/src/mongo/db/exec/sbe/stages/column_scan.cpp
index 640894aa33b..0585296ed23 100644
--- a/src/mongo/db/exec/sbe/stages/column_scan.cpp
+++ b/src/mongo/db/exec/sbe/stages/column_scan.cpp
@@ -354,106 +354,22 @@ void ColumnScanStage::readParentsIntoObj(StringData path,
 
 // The result of the filter predicate will be the same regardless of sparseness or sub objects,
 // therefore we don't look at the parents and don't consult the row store.
-// (TODO SERVER-68792) Refactor the iteration over values into its own type.
 bool ColumnScanStage::checkFilter(CellView cell, size_t filterIndex, const PathValue& path) {
     auto splitCellView = SplitCellView::parse(cell);
-    auto translatedCell = translateCell(path, splitCellView);
 
-    if (!translatedCell.moreValues()) {
-        return false;
-    }
+    SplitCellView::CursorWithArrayDepth<value::ColumnStoreEncoder> cellCursor{
+        splitCellView.firstValuePtr, splitCellView.arrInfo, &_encoder};
 
-    if (translatedCell.arrInfo.empty()) {
-        // Have a single non-nested value -- evaluate the filter on it.
-        // (TODO SERVER-68792) Could we avoid copying by using a non-owning accessor? Same question
-        // for other locations in this function when the predicate is evaluated immediately after
-        // setting the slot.
-        auto [tag, val] = translatedCell.nextValue();
-        _filterInputAccessors[filterIndex].reset(tag, val);
-        return _bytecode.runPredicate(_filterExprsCode[filterIndex].get());
-    } else {
-        ArrInfoReader arrInfoReader{translatedCell.arrInfo};
-        int depth = 0;
-
-        // Avoid allocating memory for this stack except in the rare case of deeply nested
-        // documents.
-        std::stack<bool, absl::InlinedVector<bool, 64>> inArray;
-        while (arrInfoReader.moreExplicitComponents()) {
-            switch (arrInfoReader.takeNextChar()) {
-                case '{': {
-                    // We consider as nested only the arrays that are elements of other arrays. When
-                    // there is an array of objects and some of the fields of these objects are
-                    // arrays, the latter aren't nested.
-                    inArray.push(false);
-                    break;
-                }
-                case '[': {
-                    // A '[' can be followed by a number if there are objects in the array, that
-                    // should be retrieved from other paths when reconstructing the record. We can
-                    // ignore them as they don't contribute to the values.
-                    (void)arrInfoReader.takeNumber();
-                    if (!inArray.empty() && inArray.top()) {
-                        depth++;
-                    }
-                    inArray.push(true);
-                    break;
-                }
-                case '+': {
-                    // Indicates elements in arrays that are objects that don't have the path. These
-                    // objects don't contribute to the cell's values, so we can ignore them.
-                    (void)arrInfoReader.takeNumber();
-                    break;
-                }
-                case '|': {
-                    auto repeats = arrInfoReader.takeNumber();
-                    for (size_t i = 0; i < repeats + 1; i++) {
-                        auto [tag, val] = translatedCell.nextValue();
-                        if (depth == 0) {
-                            _filterInputAccessors[filterIndex].reset(tag, val);
-                            if (_bytecode.runPredicate(_filterExprsCode[filterIndex].get())) {
-                                return true;
-                            }
-                        }
-                    }
-                    break;
-                }
-                case 'o': {
-                    // Indicates the start of a nested object inside the cell. We don't need to
-                    // track this info because the nested objects don't contribute to the values in
-                    // the cell.
-                    (void)arrInfoReader.takeNumber();
-                    break;
-                }
-                case ']': {
-                    invariant(inArray.size() > 0 && inArray.top());
-                    inArray.pop();
-                    if (inArray.size() > 0 && inArray.top()) {
-                        invariant(depth > 0);
-                        depth--;
-                    }
-
-                    // Closing an array implicitly closes all objects on the path between it and the
-                    // previous array.
-                    while (inArray.size() > 0 && !inArray.top()) {
-                        inArray.pop();
-                    }
-                }
-            }
-        }
-        if (depth == 0) {
-            // For the remaining values the depth isn't going to change so we don't need to advance
-            // the value iterator if the values are too deep.
-            while (translatedCell.moreValues()) {
-                auto [tag, val] = translatedCell.nextValue();
-                _filterInputAccessors[filterIndex].reset(tag, val);
-                if (_bytecode.runPredicate(_filterExprsCode[filterIndex].get())) {
-                    return true;
-                }
-            }
+    while (cellCursor.hasNext()) {
+        auto [val, depth] = cellCursor.nextValue();
+        if (depth > 0)
+            continue;
+
+        _filterInputAccessors[filterIndex].reset(val->first, val->second);
+        if (_bytecode.runPredicate(_filterExprsCode[filterIndex].get())) {
+            return true;
         }
     }
-
-    // None of the values matched the filter.
     return false;
 }
 
diff --git a/src/mongo/db/exec/sbe/values/columnar.h b/src/mongo/db/exec/sbe/values/columnar.h
index 01aef2acbce..ec0fb5f07c8 100644
--- a/src/mongo/db/exec/sbe/values/columnar.h
+++ b/src/mongo/db/exec/sbe/values/columnar.h
@@ -39,56 +39,6 @@
  */
 
 namespace mongo::sbe {
-/*
- * Array info reader based on a string representation of arrayInfo. Allows for reading/peeking of
- * individual components.
- */
-class ArrInfoReader {
-public:
-    explicit ArrInfoReader(StringData arrInfoStr) : _arrInfo(arrInfoStr) {}
-
-    char nextChar() const {
-        if (_offsetInArrInfo == _arrInfo.size()) {
-            // Reaching the end of the array info means an unlimited number of '|'s.
-            return '|';
-        }
-        return _arrInfo[_offsetInArrInfo];
-    }
-
-    char takeNextChar() {
-        if (_offsetInArrInfo == _arrInfo.size()) {
-            // Reaching the end of the array info means an unlimited number of '|'s.
-            return '|';
-        }
-        return _arrInfo[_offsetInArrInfo++];
-    }
-
-    size_t takeNumber() {
-        return ColumnStore::readArrInfoNumber(_arrInfo, &_offsetInArrInfo);
-    }
-
-    bool empty() const {
-        return _arrInfo.empty();
-    }
-
-    /*
-     * Returns whether more explicit components are yet to be consumed. Since array info logically
-     * ends with an infinite stream of |, this function indicates whether there are more components
-     * which are physically present to be read, not including the infinite sequence of |.
-     */
-    bool moreExplicitComponents() const {
-        return _offsetInArrInfo < _arrInfo.size();
-    }
-
-    StringData rawArrInfo() const {
-        return _arrInfo;
-    }
-
-private:
-    StringData _arrInfo;
-    size_t _offsetInArrInfo = 0;
-};
-
 /**
  * Represents a cell in a columnar index with ability to retrieve values in an SBE native format.
  */
diff --git a/src/mongo/db/storage/column_store.h b/src/mongo/db/storage/column_store.h
index 995b7e15755..9db8ca7d1bb 100644
--- a/src/mongo/db/storage/column_store.h
+++ b/src/mongo/db/storage/column_store.h
@@ -30,6 +30,7 @@
 #pragma once
 
 #include <boost/optional/optional.hpp>
+#include <stack>
 
 #include "mongo/base/string_data.h"
 #include "mongo/bson/bsonelement.h"
@@ -382,6 +383,56 @@ protected:
     std::shared_ptr<Ident> _ident;
 };
 
+/*
+ * Array info reader based on a string representation of arrayInfo. Allows for reading/peeking of
+ * individual components.
+ */
+class ArrInfoReader {
+public:
+    explicit ArrInfoReader(StringData arrInfoStr) : _arrInfo(arrInfoStr) {}
+
+    char nextChar() const {
+        if (_offsetInArrInfo == _arrInfo.size()) {
+            // Reaching the end of the array info means an unlimited number of '|'s.
+            return '|';
+        }
+        return _arrInfo[_offsetInArrInfo];
+    }
+
+    char takeNextChar() {
+        if (_offsetInArrInfo == _arrInfo.size()) {
+            // Reaching the end of the array info means an unlimited number of '|'s.
+            return '|';
+        }
+        return _arrInfo[_offsetInArrInfo++];
+    }
+
+    size_t takeNumber() {
+        return ColumnStore::readArrInfoNumber(_arrInfo, &_offsetInArrInfo);
+    }
+
+    bool empty() const {
+        return _arrInfo.empty();
+    }
+
+    /*
+     * Returns whether more explicit components are yet to be consumed. Since array info logically
+     * ends with an infinite stream of |, this function indicates whether there are more components
+     * which are physically present to be read, not including the infinite sequence of |.
+     */
+    bool moreExplicitComponents() const {
+        return _offsetInArrInfo < _arrInfo.size();
+    }
+
+    StringData rawArrInfo() const {
+        return _arrInfo;
+    }
+
+private:
+    StringData _arrInfo;
+    size_t _offsetInArrInfo = 0;
+};
+
 struct SplitCellView {
     StringData arrInfo;  // rawData() is 1-past-end of range starting with firstValuePtr.
     const char* firstValuePtr = nullptr;
@@ -423,6 +474,108 @@ struct SplitCellView {
         return Cursor<ValueEncoder>{firstValuePtr, arrInfo.rawData(), valEncoder};
     }
 
+    template <class ValueEncoder>
+    struct CursorWithArrayDepth {
+        CursorWithArrayDepth(const char* elemPtr,
+                             const StringData& arrayInfo,
+                             ValueEncoder* encoder)
+            : elemPtr(elemPtr),
+              end(arrayInfo.rawData()),
+              encoder(encoder),
+              arrInfoReader(arrayInfo) {}
+
+        bool hasNext() const {
+            return elemPtr < end;
+        }
+
+        using Out = typename std::remove_reference_t<ValueEncoder>::Out;
+        /**
+         * Returns the next value with its array-nestedness level.
+         */
+        std::pair<Out /*decoded value*/, int /*depth*/> nextValue() {
+            invariant(elemPtr < end, "The client should have checked 'hasNext()'");
+
+            // The expected most common case: implicit tail of values at the same depth.
+            if (!arrInfoReader.moreExplicitComponents()) {
+                return {decodeAndAdvance(elemPtr, *encoder), depth};
+            }
+
+            // The next expected most common case: a range of values at the same depth.
+            if (repeats > 0) {
+                repeats--;
+                return {decodeAndAdvance(elemPtr, *encoder), depth};
+            }
+
+            // An end of a range means we have to check for structural changes and update depth.
+            while (arrInfoReader.moreExplicitComponents()) {
+                switch (arrInfoReader.takeNextChar()) {
+                    case '[': {
+                        // A '[' can be followed by a number if there are objects in the array,
+                        // that should be retrieved from other paths when reconstructing the
+                        // record. We can ignore them as they don't contribute to the values.
+                        (void)arrInfoReader.takeNumber();
+                        if (!inArray.empty() && inArray.top()) {
+                            depth++;
+                        }
+                        inArray.push(true);
+                        break;
+                    }
+                    case '|': {
+                        repeats = arrInfoReader.takeNumber();
+                        return {decodeAndAdvance(elemPtr, *encoder), depth};
+                    }
+                    case '{': {
+                        // We consider as nested only the arrays that are elements of other
+                        // arrays. When there is an array of objects and some of the fields of
+                        // these objects are arrays, the latter aren't nested.
+                        inArray.push(false);
+                        break;
+                    }
+                    case ']': {
+                        invariant(inArray.size() > 0 && inArray.top());
+                        inArray.pop();
+                        if (inArray.size() > 0 && inArray.top()) {
+                            invariant(depth > 0);
+                            depth--;
+                        }
+
+                        // Closing an array implicitly closes all objects on the path between it
+                        // and the previous array.
+                        while (inArray.size() > 0 && !inArray.top()) {
+                            inArray.pop();
+                        }
+                        break;
+                    }
+                    case '+': {
+                        // Indicates elements in arrays that are objects that don't have the
+                        // path. These objects don't contribute to the cell's values, so we can
+                        // ignore them.
+                        (void)arrInfoReader.takeNumber();
+                        break;
+                    }
+                    case 'o': {
+                        // Indicates the start of a nested object inside the cell. We don't need
+                        // to track this info because the nested objects don't contribute to the
+                        // values in the cell.
+                        (void)arrInfoReader.takeNumber();
+                        break;
+                    }
+                }
+            }
+
+            // Start consuming the implicit tail range.
+            return {decodeAndAdvance(elemPtr, *encoder), depth};
+        }
+
+        const char* elemPtr;
+        const char* end;
+        ValueEncoder* encoder;
+        ArrInfoReader arrInfoReader;
+        int depth = 0;
+        std::stack<bool, absl::InlinedVector<bool, 64>> inArray;
+        size_t repeats = 0;
+    };
+
     static SplitCellView parse(CellView cell) {
         using Bytes = ColumnStore::Bytes;
         using TinySize = ColumnStore::Bytes::TinySize;
author	Irina Yatsenko <irina.yatsenko@mongodb.com>	2022-09-23 15:58:23 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-09-23 17:00:51 +0000
commit	02c78ac52fd67148b499b0cbf0f422a50bd92943 (patch)
tree	d74689c80944e87bf8c97bfc460a5d7d1cd0a327
parent	d22d68c4c2fcdc76fcd0079a12e9f5e39b119381 (diff)
download	mongo-02c78ac52fd67148b499b0cbf0f422a50bd92943.tar.gz