From 3cc20216a850af1d4bf63956740d73e8fc3779df Mon Sep 17 00:00:00 2001
From: Dan Larkin-York <dan.larkin-york@mongodb.com>
Date: Tue, 5 Jul 2022 21:33:57 +0000
Subject: SERVER-67622 Optimize timeseries_dotted_path_support functions and
 fix array handling

---
 .../timeseries/timeseries_dotted_path_support.cpp  | 306 +++++++++++++--------
 .../timeseries_dotted_path_support_test.cpp        |  20 +-
 2 files changed, 209 insertions(+), 117 deletions(-)

(limited to 'src/mongo/db/timeseries')
diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp b/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp
index b28e198429e..e36c674ad6f 100644
--- a/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp
+++ b/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp
@@ -66,78 +66,105 @@ void _handleElementForExtractAllElementsOnBucketPath(const BSONObj& obj,
                                                      BSONElementSet& elements,
                                                      bool expandArrayOnTrailingField,
                                                      BSONDepthIndex depth,
-                                                     MultikeyComponents* arrayComponents) {
-    if (elem.eoo()) {
-        size_t idx = path.find('.');
-        if (idx != std::string::npos) {
-            invariant(depth != std::numeric_limits<BSONDepthIndex>::max());
-            StringData left = path.substr(0, idx);
-            StringData next = path.substr(idx + 1, path.size());
+                                                     MultikeyComponents* arrayComponents);
 
-            BSONElement e = obj.getField(left);
-
-            if (e.type() == Object) {
-                BSONObj embedded = e.embeddedObject();
-                _handleElementForExtractAllElementsOnBucketPath(embedded,
-                                                                embedded.getField(next),
-                                                                next,
-                                                                elements,
-                                                                expandArrayOnTrailingField,
-                                                                depth + 1,
-                                                                arrayComponents);
-            } else if (e.type() == Array) {
-                bool allDigits = false;
-                if (next.size() > 0 && ctype::isDigit(next[0])) {
-                    unsigned temp = 1;
-                    while (temp < next.size() && ctype::isDigit(next[temp]))
-                        temp++;
-                    allDigits = temp == next.size() || next[temp] == '.';
-                }
-                if (allDigits) {
-                    BSONObj embedded = e.embeddedObject();
+void _handleIntermediateElementForExtractAllElementsOnBucketPath(
+    BSONElement elem,
+    StringData path,
+    BSONElementSet& elements,
+    bool expandArrayOnTrailingField,
+    BSONDepthIndex depth,
+    MultikeyComponents* arrayComponents) {
+    if (elem.type() == Object) {
+        BSONObj embedded = elem.embeddedObject();
+        _handleElementForExtractAllElementsOnBucketPath(embedded,
+                                                        embedded.getField(path),
+                                                        path,
+                                                        elements,
+                                                        expandArrayOnTrailingField,
+                                                        depth + 1,
+                                                        arrayComponents);
+    } else if (elem.type() == Array) {
+        bool allDigits = false;
+        if (path.size() > 0 && ctype::isDigit(path[0])) {
+            unsigned temp = 1;
+            while (temp < path.size() && ctype::isDigit(path[temp]))
+                temp++;
+            allDigits = temp == path.size() || path[temp] == '.';
+        }
+        if (allDigits) {
+            BSONObj embedded = elem.embeddedObject();
+            _handleElementForExtractAllElementsOnBucketPath(embedded,
+                                                            embedded.getField(path),
+                                                            path,
+                                                            elements,
+                                                            expandArrayOnTrailingField,
+                                                            depth + 1,
+                                                            arrayComponents);
+        } else {
+            BSONObjIterator i(elem.embeddedObject());
+            while (i.more()) {
+                BSONElement e2 = i.next();
+                if (e2.type() == Object || e2.type() == Array) {
+                    BSONObj embedded = e2.embeddedObject();
                     _handleElementForExtractAllElementsOnBucketPath(embedded,
-                                                                    embedded.getField(next),
-                                                                    next,
+                                                                    embedded.getField(path),
+                                                                    path,
                                                                     elements,
                                                                     expandArrayOnTrailingField,
                                                                     depth + 1,
                                                                     arrayComponents);
-                } else {
-                    BSONObjIterator i(e.embeddedObject());
-                    while (i.more()) {
-                        BSONElement e2 = i.next();
-                        if (e2.type() == Object || e2.type() == Array) {
-                            BSONObj embedded = e2.embeddedObject();
-                            _handleElementForExtractAllElementsOnBucketPath(
-                                embedded,
-                                embedded.getField(next),
-                                next,
-                                elements,
-                                expandArrayOnTrailingField,
-                                depth + 1,
-                                arrayComponents);
-                        }
-                    }
-                    if (arrayComponents) {
-                        arrayComponents->insert(depth);
-                    }
                 }
-            } else {
-                // do nothing: no match
-            }
-        }
-    } else {
-        if (elem.type() == Array && expandArrayOnTrailingField) {
-            BSONObjIterator i(elem.embeddedObject());
-            while (i.more()) {
-                elements.insert(i.next());
             }
             if (arrayComponents) {
                 arrayComponents->insert(depth);
             }
-        } else {
-            elements.insert(elem);
         }
+    } else {
+        // do nothing: no match
+    }
+}
+
+void _handleTerminalElementForExtractAllElementsOnBucketPath(BSONElement elem,
+                                                             BSONElementSet& elements,
+                                                             bool expandArrayOnTrailingField,
+                                                             BSONDepthIndex depth,
+                                                             MultikeyComponents* arrayComponents) {
+    if (elem.type() == Array && expandArrayOnTrailingField) {
+        BSONObjIterator i(elem.embeddedObject());
+        while (i.more()) {
+            elements.insert(i.next());
+        }
+        if (arrayComponents) {
+            arrayComponents->insert(depth);
+        }
+    } else {
+        elements.insert(elem);
+    }
+}
+
+void _handleElementForExtractAllElementsOnBucketPath(const BSONObj& obj,
+                                                     BSONElement elem,
+                                                     StringData path,
+                                                     BSONElementSet& elements,
+                                                     bool expandArrayOnTrailingField,
+                                                     BSONDepthIndex depth,
+                                                     MultikeyComponents* arrayComponents) {
+    if (elem.eoo()) {
+        size_t idx = path.find('.');
+        if (idx != std::string::npos) {
+            invariant(depth != std::numeric_limits<BSONDepthIndex>::max());
+            StringData left = path.substr(0, idx);
+            StringData next = path.substr(idx + 1, path.size());
+
+            BSONElement e = obj.getField(left);
+
+            _handleIntermediateElementForExtractAllElementsOnBucketPath(
+                e, next, elements, expandArrayOnTrailingField, depth, arrayComponents);
+        }
+    } else {
+        _handleTerminalElementForExtractAllElementsOnBucketPath(
+            elem, elements, expandArrayOnTrailingField, depth, arrayComponents);
     }
 }
 
@@ -165,16 +192,21 @@ boost::optional<BSONColumn> _extractAllElementsAlongBucketPath(
                                                                   depth + 1,
                                                                   arrayComponents);
                     } else if (isCompressed && e.type() == BinData) {
-                        // Unbucketing magic happens here for nested measurement fields (i.e.
-                        // data.a.b) in compressed buckets.
+                        // Unbucketing happens here for nested measurement fields (i.e. data.a.b) in
+                        // compressed buckets. We know that 'e' corresponds to the top-level
+                        // measurement field (i.e. data.a) and we need to iterate over each of the
+                        // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) and do a
+                        // field lookup to extract the actual field we want (i.e. data.a.1.b).
+                        // Thanks to the bucket structure, we know there is no literal field with a
+                        // dot at this depth (e.g. '1.b'), so we can skip that field lookup, but
+                        // it's possible that the element e2 stores an object, array, or other type,
+                        // and we need to figure out how to resolve the rest of the path based on
+                        // the type.
                         BSONColumn storage{e};
                         for (const BSONElement& e2 : storage) {
                             if (!e2.eoo()) {
-                                BSONObj embedded =
-                                    e2.isABSONObj() ? e2.embeddedObject() : BSONObj();
-                                _handleElementForExtractAllElementsOnBucketPath(
-                                    embedded,
-                                    embedded.getField(next),
+                                _handleIntermediateElementForExtractAllElementsOnBucketPath(
+                                    e2,
                                     next,
                                     elements,
                                     expandArrayOnTrailingField,
@@ -182,6 +214,8 @@ boost::optional<BSONColumn> _extractAllElementsAlongBucketPath(
                                     arrayComponents);
                             }
                         }
+                        // Need to pass along the column since it owns the memory referenced by the
+                        // extracted elements.
                         return std::move(storage);
                     }
                 }
@@ -196,43 +230,48 @@ boost::optional<BSONColumn> _extractAllElementsAlongBucketPath(
                                                        depth + 1,
                                                        arrayComponents);
                 } else if (isCompressed && BinData == e.type()) {
-                    // Unbucketing magic happens here for top-level measurement fields (i.e. data.a)
-                    // in compressed buckets.
+                    // Unbucketing happens here for top-level measurement fields (i.e. data.a) in
+                    // compressed buckets. We know that 'e' corresponds to the top-level
+                    // measurement field (i.e. data.a) and we need to iterate over each of the
+                    // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to extract
+                    // the actual field we want.
                     invariant(depth == 1);
                     BSONColumn storage{e};
                     for (const BSONElement& e2 : storage) {
                         if (!e2.eoo()) {
                             BSONObj embedded = e2.isABSONObj() ? e2.embeddedObject() : BSONObj();
-                            _handleElementForExtractAllElementsOnBucketPath(
-                                embedded,
-                                e2,
-                                ""_sd,
-                                elements,
-                                expandArrayOnTrailingField,
-                                depth,
-                                arrayComponents);
+                            _handleTerminalElementForExtractAllElementsOnBucketPath(
+                                e2, elements, expandArrayOnTrailingField, depth, arrayComponents);
                         }
                     }
+                    // Need to pass along the column since it owns the memory referenced by the
+                    // extracted elements.
                     return std::move(storage);
                 }
             }
             break;
         }
         case 2: {
-            // Unbucketing magic happens here for uncompressed buckets.
+            // Unbucketing happens here for uncompressed buckets. We know that 'obj' corresponds to
+            // the top-level measurement field (i.e. data.a) and we need to iterate over each of the
+            // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to extract the actual
+            // field we want. If we are after a top-level field, then we already have the element we
+            // want in 'e'. If we are after a nested field, then we need to recurse.
             invariant(!isCompressed);
             for (const BSONElement& e : obj) {
-                std::string subPath = e.fieldName();
-                if (!path.empty()) {
-                    subPath.append("." + path);
+                if (path.empty()) {
+                    // The top-level measurement field (i.e. data.a) is the indexed field we are
+                    // trying to extract, so we can target it directly.
+                    _handleTerminalElementForExtractAllElementsOnBucketPath(
+                        e, elements, expandArrayOnTrailingField, depth, arrayComponents);
+                } else {
+                    // We'll need to inspect at least one more field, but we can take advantage of
+                    // the bucket structure here to skip checking for any field at this depth with a
+                    // dot in the name (e.g. '1.b') and instead just look at the next level down
+                    // (e.g. look within '1' for a field named 'b').
+                    _handleIntermediateElementForExtractAllElementsOnBucketPath(
+                        e, path, elements, expandArrayOnTrailingField, depth, arrayComponents);
                 }
-                _handleElementForExtractAllElementsOnBucketPath(obj,
-                                                                obj.getField(subPath),
-                                                                subPath,
-                                                                elements,
-                                                                expandArrayOnTrailingField,
-                                                                depth,
-                                                                arrayComponents);
             }
             break;
         }
@@ -244,6 +283,30 @@ boost::optional<BSONColumn> _extractAllElementsAlongBucketPath(
 
 bool _haveArrayAlongBucketDataPath(const BSONObj& obj, StringData path, BSONDepthIndex depth);
 
+bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj,
+                                                   BSONElement elem,
+                                                   StringData path,
+                                                   BSONDepthIndex depth);
+
+bool _handleIntermediateElementForHaveArrayAlongBucketDataPath(BSONElement elem,
+                                                               StringData path,
+                                                               BSONDepthIndex depth) {
+    if (elem.type() == Object) {
+        auto embedded = elem.embeddedObject();
+        return _handleElementForHaveArrayAlongBucketDataPath(
+            embedded, embedded.getField(path), path, depth + 1);
+    } else if (elem.type() == Array) {
+        return true;
+    }
+    // no match
+    return false;
+}
+
+bool _handleTerminalElementForHaveArrayAlongBucketDataPath(BSONElement elem) {
+    return (elem.type() == Array);
+}
+
+
 bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj,
                                                    BSONElement elem,
                                                    StringData path,
@@ -259,20 +322,10 @@ bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj,
 
             BSONElement e = obj.getField(left);
 
-            if (e.type() == Object) {
-                auto embedded = e.embeddedObject();
-                return _handleElementForHaveArrayAlongBucketDataPath(
-                    embedded, embedded.getField(next), next, depth + 1);
-            } else if (e.type() == Array) {
-                return true;
-            } else {
-                // do nothing: no match
-            }
+            return _handleIntermediateElementForHaveArrayAlongBucketDataPath(e, next, depth);
         }
     } else {
-        if (elem.type() == Array) {
-            return true;
-        }
+        return _handleTerminalElementForHaveArrayAlongBucketDataPath(elem);
     }
 
     return false;
@@ -293,13 +346,21 @@ bool _haveArrayAlongBucketDataPath(const BSONObj& obj,
                         return _haveArrayAlongBucketDataPath(
                             e.embeddedObject(), next, isCompressed, depth + 1);
                     } else if (isCompressed && BinData == e.type()) {
-                        // Unbucketing magic happens here for nested measurement fields (i.e.
-                        // data.a.b) in compressed buckets.
+                        // Unbucketing happens here for nested measurement fields (i.e. data.a.b) in
+                        // compressed buckets. We know that 'e' corresponds to the top-level
+                        // measurement field (i.e. data.a) and we need to iterate over each of the
+                        // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) and do a
+                        // field lookup to find the actual field we want (i.e. data.a.1.b).
+                        // Thanks to the bucket structure, we know there is no literal field with a
+                        // dot at this depth (e.g. '1.b'), so we can skip that field lookup, but
+                        // it's possible that the element e2 stores an object, array, or other type,
+                        // and we need to figure out how to resolve the rest of the path based on
+                        // the type.
                         BSONColumn column{e};
                         for (const BSONElement& e2 : column) {
-                            BSONObj embedded = e2.isABSONObj() ? e2.embeddedObject() : BSONObj();
-                            const bool foundArray = _handleElementForHaveArrayAlongBucketDataPath(
-                                embedded, embedded.getField(next), next, depth);
+                            const bool foundArray =
+                                _handleIntermediateElementForHaveArrayAlongBucketDataPath(
+                                    e2, next, depth);
                             if (foundArray) {
                                 return foundArray;
                             }
@@ -312,12 +373,15 @@ bool _haveArrayAlongBucketDataPath(const BSONObj& obj,
                     return _haveArrayAlongBucketDataPath(
                         e.embeddedObject(), StringData(), isCompressed, depth + 1);
                 } else if (BinData == e.type()) {
-                    // Unbucketing magic happens here for top-level measurement fields (i.e. data.a)
-                    // in compressed buckets.
+                    // Unbucketing happens here for top-level measurement fields (i.e. data.a) in
+                    // compressed buckets. We know that 'e' corresponds to the top-level
+                    // measurement field (i.e. data.a) and we need to iterate over each of the
+                    // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to decide whether
+                    // we have any arrays.
                     invariant(isCompressed && depth == 1);
                     BSONColumn column{e};
                     for (const BSONElement& e2 : column) {
-                        if (e2.type() == Array) {
+                        if (_handleTerminalElementForHaveArrayAlongBucketDataPath(e2)) {
                             return true;
                         }
                     }
@@ -326,16 +390,28 @@ bool _haveArrayAlongBucketDataPath(const BSONObj& obj,
             return false;
         }
         case 2: {
-            // Unbucketing magic happens here for uncompressed buckets.
+            // Unbucketing happens here for uncompressed buckets. We know that 'obj' corresponds to
+            // the top-level measurement field (i.e. data.a) and we need to iterate over each of the
+            // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to extract the actual
+            // field we want. If we are after a top-level field, then we already have the element we
+            // want in 'e'. If we are after a nested field, then we need recurse.
             invariant(!isCompressed);
             for (const BSONElement& e : obj) {
-                std::string subPath = e.fieldName();
-                if (!path.empty()) {
-                    subPath.append("." + path);
+                bool foundArray = false;
+                if (path.empty()) {
+                    // The top-level measurement field (i.e. data.a) is the field we are trying to
+                    // check, so we can target it directly.
+                    if (_handleTerminalElementForHaveArrayAlongBucketDataPath(e)) {
+                        return true;
+                    }
+                } else {
+                    // We'll need to inspect at least one more field, but we can take advantage of
+                    // the bucket structure here to skip checking for any field at this depth with a
+                    // dot in the name (e.g. '1.b') and instead just look at the next level down
+                    // (e.g. look within '1' for a field named 'b').
+                    foundArray =
+                        _handleIntermediateElementForHaveArrayAlongBucketDataPath(e, path, depth);
                 }
-                BSONElement sub = obj.getField(subPath);
-                const bool foundArray =
-                    _handleElementForHaveArrayAlongBucketDataPath(obj, sub, subPath, depth);
                 if (foundArray) {
                     return foundArray;
                 }
diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp b/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp
index afabfc4e0a6..072deec9b77 100644
--- a/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp
+++ b/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp
@@ -100,6 +100,12 @@ TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) {
                     b: true
                 }
             }
+        },
+        i: {
+            "1": [
+                {a: true},
+                {a: false}
+            ]
         }
     }
 })");
@@ -115,7 +121,7 @@ TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) {
         ASSERT_FALSE(
             tdps::haveArrayAlongBucketDataPath(obj, "data.b"));  // bucket expansion hides array
         ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.c"));
-        invariant(tdps::haveArrayAlongBucketDataPath(obj, "data.d"));
+        ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.d"));
         ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.e"));
         ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.f"));
         ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.f.a"));
@@ -123,6 +129,8 @@ TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) {
         ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a"));
         ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a.a"));
         ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.h.a.b"));
+        ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.i"));
+        ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.i.a"));
     });
 }
 
@@ -365,6 +373,12 @@ TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) {
                     b: false
                 }
             }
+        },
+        i: {
+            "1": [
+                {a: true},
+                {a: false}
+            ]
         }
     }
 })");
@@ -379,7 +393,8 @@ TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) {
                 expected.emplace(el);
             }
 
-            ASSERT_EQ(actual.size(), expected.size());
+            ASSERT_EQ(actual.size(), expected.size())
+                << "Expected path '" << path << "' to yield " << expectedStorage << " from " << obj;
 
             auto actualIt = actual.begin();
             auto expectedIt = expected.begin();
@@ -408,6 +423,7 @@ TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) {
             BSON_ARRAY(BSON("a" << BSON("b" << true)) << BSON("a" << BSON("b" << false))));
         assertExtractionMatches("data.h.a"_sd, BSON_ARRAY(BSON("b" << true) << BSON("b" << false)));
         assertExtractionMatches("data.h.a.b"_sd, BSON_ARRAY(true << false));
+        assertExtractionMatches("data.i.a"_sd, BSON_ARRAY(true << false));
     });
 }
 }  // namespace
-- 
cgit v1.2.1