From 3cc20216a850af1d4bf63956740d73e8fc3779df Mon Sep 17 00:00:00 2001 From: Dan Larkin-York Date: Tue, 5 Jul 2022 21:33:57 +0000 Subject: SERVER-67622 Optimize timeseries_dotted_path_support functions and fix array handling --- .../timeseries/timeseries_dotted_path_support.cpp | 306 +++++++++++++-------- .../timeseries_dotted_path_support_test.cpp | 20 +- 2 files changed, 209 insertions(+), 117 deletions(-) (limited to 'src/mongo/db/timeseries') diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp b/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp index b28e198429e..e36c674ad6f 100644 --- a/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp +++ b/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp @@ -66,78 +66,105 @@ void _handleElementForExtractAllElementsOnBucketPath(const BSONObj& obj, BSONElementSet& elements, bool expandArrayOnTrailingField, BSONDepthIndex depth, - MultikeyComponents* arrayComponents) { - if (elem.eoo()) { - size_t idx = path.find('.'); - if (idx != std::string::npos) { - invariant(depth != std::numeric_limits::max()); - StringData left = path.substr(0, idx); - StringData next = path.substr(idx + 1, path.size()); + MultikeyComponents* arrayComponents); - BSONElement e = obj.getField(left); - - if (e.type() == Object) { - BSONObj embedded = e.embeddedObject(); - _handleElementForExtractAllElementsOnBucketPath(embedded, - embedded.getField(next), - next, - elements, - expandArrayOnTrailingField, - depth + 1, - arrayComponents); - } else if (e.type() == Array) { - bool allDigits = false; - if (next.size() > 0 && ctype::isDigit(next[0])) { - unsigned temp = 1; - while (temp < next.size() && ctype::isDigit(next[temp])) - temp++; - allDigits = temp == next.size() || next[temp] == '.'; - } - if (allDigits) { - BSONObj embedded = e.embeddedObject(); +void _handleIntermediateElementForExtractAllElementsOnBucketPath( + BSONElement elem, + StringData path, + BSONElementSet& elements, + bool expandArrayOnTrailingField, + BSONDepthIndex depth, + MultikeyComponents* arrayComponents) { + if (elem.type() == Object) { + BSONObj embedded = elem.embeddedObject(); + _handleElementForExtractAllElementsOnBucketPath(embedded, + embedded.getField(path), + path, + elements, + expandArrayOnTrailingField, + depth + 1, + arrayComponents); + } else if (elem.type() == Array) { + bool allDigits = false; + if (path.size() > 0 && ctype::isDigit(path[0])) { + unsigned temp = 1; + while (temp < path.size() && ctype::isDigit(path[temp])) + temp++; + allDigits = temp == path.size() || path[temp] == '.'; + } + if (allDigits) { + BSONObj embedded = elem.embeddedObject(); + _handleElementForExtractAllElementsOnBucketPath(embedded, + embedded.getField(path), + path, + elements, + expandArrayOnTrailingField, + depth + 1, + arrayComponents); + } else { + BSONObjIterator i(elem.embeddedObject()); + while (i.more()) { + BSONElement e2 = i.next(); + if (e2.type() == Object || e2.type() == Array) { + BSONObj embedded = e2.embeddedObject(); _handleElementForExtractAllElementsOnBucketPath(embedded, - embedded.getField(next), - next, + embedded.getField(path), + path, elements, expandArrayOnTrailingField, depth + 1, arrayComponents); - } else { - BSONObjIterator i(e.embeddedObject()); - while (i.more()) { - BSONElement e2 = i.next(); - if (e2.type() == Object || e2.type() == Array) { - BSONObj embedded = e2.embeddedObject(); - _handleElementForExtractAllElementsOnBucketPath( - embedded, - embedded.getField(next), - next, - elements, - expandArrayOnTrailingField, - depth + 1, - arrayComponents); - } - } - if (arrayComponents) { - arrayComponents->insert(depth); - } } - } else { - // do nothing: no match - } - } - } else { - if (elem.type() == Array && expandArrayOnTrailingField) { - BSONObjIterator i(elem.embeddedObject()); - while (i.more()) { - elements.insert(i.next()); } if (arrayComponents) { arrayComponents->insert(depth); } - } else { - elements.insert(elem); } + } else { + // do nothing: no match + } +} + +void _handleTerminalElementForExtractAllElementsOnBucketPath(BSONElement elem, + BSONElementSet& elements, + bool expandArrayOnTrailingField, + BSONDepthIndex depth, + MultikeyComponents* arrayComponents) { + if (elem.type() == Array && expandArrayOnTrailingField) { + BSONObjIterator i(elem.embeddedObject()); + while (i.more()) { + elements.insert(i.next()); + } + if (arrayComponents) { + arrayComponents->insert(depth); + } + } else { + elements.insert(elem); + } +} + +void _handleElementForExtractAllElementsOnBucketPath(const BSONObj& obj, + BSONElement elem, + StringData path, + BSONElementSet& elements, + bool expandArrayOnTrailingField, + BSONDepthIndex depth, + MultikeyComponents* arrayComponents) { + if (elem.eoo()) { + size_t idx = path.find('.'); + if (idx != std::string::npos) { + invariant(depth != std::numeric_limits::max()); + StringData left = path.substr(0, idx); + StringData next = path.substr(idx + 1, path.size()); + + BSONElement e = obj.getField(left); + + _handleIntermediateElementForExtractAllElementsOnBucketPath( + e, next, elements, expandArrayOnTrailingField, depth, arrayComponents); + } + } else { + _handleTerminalElementForExtractAllElementsOnBucketPath( + elem, elements, expandArrayOnTrailingField, depth, arrayComponents); } } @@ -165,16 +192,21 @@ boost::optional _extractAllElementsAlongBucketPath( depth + 1, arrayComponents); } else if (isCompressed && e.type() == BinData) { - // Unbucketing magic happens here for nested measurement fields (i.e. - // data.a.b) in compressed buckets. + // Unbucketing happens here for nested measurement fields (i.e. data.a.b) in + // compressed buckets. We know that 'e' corresponds to the top-level + // measurement field (i.e. data.a) and we need to iterate over each of the + // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) and do a + // field lookup to extract the actual field we want (i.e. data.a.1.b). + // Thanks to the bucket structure, we know there is no literal field with a + // dot at this depth (e.g. '1.b'), so we can skip that field lookup, but + // it's possible that the element e2 stores an object, array, or other type, + // and we need to figure out how to resolve the rest of the path based on + // the type. BSONColumn storage{e}; for (const BSONElement& e2 : storage) { if (!e2.eoo()) { - BSONObj embedded = - e2.isABSONObj() ? e2.embeddedObject() : BSONObj(); - _handleElementForExtractAllElementsOnBucketPath( - embedded, - embedded.getField(next), + _handleIntermediateElementForExtractAllElementsOnBucketPath( + e2, next, elements, expandArrayOnTrailingField, @@ -182,6 +214,8 @@ boost::optional _extractAllElementsAlongBucketPath( arrayComponents); } } + // Need to pass along the column since it owns the memory referenced by the + // extracted elements. return std::move(storage); } } @@ -196,43 +230,48 @@ boost::optional _extractAllElementsAlongBucketPath( depth + 1, arrayComponents); } else if (isCompressed && BinData == e.type()) { - // Unbucketing magic happens here for top-level measurement fields (i.e. data.a) - // in compressed buckets. + // Unbucketing happens here for top-level measurement fields (i.e. data.a) in + // compressed buckets. We know that 'e' corresponds to the top-level + // measurement field (i.e. data.a) and we need to iterate over each of the + // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to extract + // the actual field we want. invariant(depth == 1); BSONColumn storage{e}; for (const BSONElement& e2 : storage) { if (!e2.eoo()) { BSONObj embedded = e2.isABSONObj() ? e2.embeddedObject() : BSONObj(); - _handleElementForExtractAllElementsOnBucketPath( - embedded, - e2, - ""_sd, - elements, - expandArrayOnTrailingField, - depth, - arrayComponents); + _handleTerminalElementForExtractAllElementsOnBucketPath( + e2, elements, expandArrayOnTrailingField, depth, arrayComponents); } } + // Need to pass along the column since it owns the memory referenced by the + // extracted elements. return std::move(storage); } } break; } case 2: { - // Unbucketing magic happens here for uncompressed buckets. + // Unbucketing happens here for uncompressed buckets. We know that 'obj' corresponds to + // the top-level measurement field (i.e. data.a) and we need to iterate over each of the + // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to extract the actual + // field we want. If we are after a top-level field, then we already have the element we + // want in 'e'. If we are after a nested field, then we need to recurse. invariant(!isCompressed); for (const BSONElement& e : obj) { - std::string subPath = e.fieldName(); - if (!path.empty()) { - subPath.append("." + path); + if (path.empty()) { + // The top-level measurement field (i.e. data.a) is the indexed field we are + // trying to extract, so we can target it directly. + _handleTerminalElementForExtractAllElementsOnBucketPath( + e, elements, expandArrayOnTrailingField, depth, arrayComponents); + } else { + // We'll need to inspect at least one more field, but we can take advantage of + // the bucket structure here to skip checking for any field at this depth with a + // dot in the name (e.g. '1.b') and instead just look at the next level down + // (e.g. look within '1' for a field named 'b'). + _handleIntermediateElementForExtractAllElementsOnBucketPath( + e, path, elements, expandArrayOnTrailingField, depth, arrayComponents); } - _handleElementForExtractAllElementsOnBucketPath(obj, - obj.getField(subPath), - subPath, - elements, - expandArrayOnTrailingField, - depth, - arrayComponents); } break; } @@ -244,6 +283,30 @@ boost::optional _extractAllElementsAlongBucketPath( bool _haveArrayAlongBucketDataPath(const BSONObj& obj, StringData path, BSONDepthIndex depth); +bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj, + BSONElement elem, + StringData path, + BSONDepthIndex depth); + +bool _handleIntermediateElementForHaveArrayAlongBucketDataPath(BSONElement elem, + StringData path, + BSONDepthIndex depth) { + if (elem.type() == Object) { + auto embedded = elem.embeddedObject(); + return _handleElementForHaveArrayAlongBucketDataPath( + embedded, embedded.getField(path), path, depth + 1); + } else if (elem.type() == Array) { + return true; + } + // no match + return false; +} + +bool _handleTerminalElementForHaveArrayAlongBucketDataPath(BSONElement elem) { + return (elem.type() == Array); +} + + bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj, BSONElement elem, StringData path, @@ -259,20 +322,10 @@ bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj, BSONElement e = obj.getField(left); - if (e.type() == Object) { - auto embedded = e.embeddedObject(); - return _handleElementForHaveArrayAlongBucketDataPath( - embedded, embedded.getField(next), next, depth + 1); - } else if (e.type() == Array) { - return true; - } else { - // do nothing: no match - } + return _handleIntermediateElementForHaveArrayAlongBucketDataPath(e, next, depth); } } else { - if (elem.type() == Array) { - return true; - } + return _handleTerminalElementForHaveArrayAlongBucketDataPath(elem); } return false; @@ -293,13 +346,21 @@ bool _haveArrayAlongBucketDataPath(const BSONObj& obj, return _haveArrayAlongBucketDataPath( e.embeddedObject(), next, isCompressed, depth + 1); } else if (isCompressed && BinData == e.type()) { - // Unbucketing magic happens here for nested measurement fields (i.e. - // data.a.b) in compressed buckets. + // Unbucketing happens here for nested measurement fields (i.e. data.a.b) in + // compressed buckets. We know that 'e' corresponds to the top-level + // measurement field (i.e. data.a) and we need to iterate over each of the + // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) and do a + // field lookup to find the actual field we want (i.e. data.a.1.b). + // Thanks to the bucket structure, we know there is no literal field with a + // dot at this depth (e.g. '1.b'), so we can skip that field lookup, but + // it's possible that the element e2 stores an object, array, or other type, + // and we need to figure out how to resolve the rest of the path based on + // the type. BSONColumn column{e}; for (const BSONElement& e2 : column) { - BSONObj embedded = e2.isABSONObj() ? e2.embeddedObject() : BSONObj(); - const bool foundArray = _handleElementForHaveArrayAlongBucketDataPath( - embedded, embedded.getField(next), next, depth); + const bool foundArray = + _handleIntermediateElementForHaveArrayAlongBucketDataPath( + e2, next, depth); if (foundArray) { return foundArray; } @@ -312,12 +373,15 @@ bool _haveArrayAlongBucketDataPath(const BSONObj& obj, return _haveArrayAlongBucketDataPath( e.embeddedObject(), StringData(), isCompressed, depth + 1); } else if (BinData == e.type()) { - // Unbucketing magic happens here for top-level measurement fields (i.e. data.a) - // in compressed buckets. + // Unbucketing happens here for top-level measurement fields (i.e. data.a) in + // compressed buckets. We know that 'e' corresponds to the top-level + // measurement field (i.e. data.a) and we need to iterate over each of the + // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to decide whether + // we have any arrays. invariant(isCompressed && depth == 1); BSONColumn column{e}; for (const BSONElement& e2 : column) { - if (e2.type() == Array) { + if (_handleTerminalElementForHaveArrayAlongBucketDataPath(e2)) { return true; } } @@ -326,16 +390,28 @@ bool _haveArrayAlongBucketDataPath(const BSONObj& obj, return false; } case 2: { - // Unbucketing magic happens here for uncompressed buckets. + // Unbucketing happens here for uncompressed buckets. We know that 'obj' corresponds to + // the top-level measurement field (i.e. data.a) and we need to iterate over each of the + // numerically-indexed entries (i.e. data.a.1, data.a.5, etc.) to extract the actual + // field we want. If we are after a top-level field, then we already have the element we + // want in 'e'. If we are after a nested field, then we need recurse. invariant(!isCompressed); for (const BSONElement& e : obj) { - std::string subPath = e.fieldName(); - if (!path.empty()) { - subPath.append("." + path); + bool foundArray = false; + if (path.empty()) { + // The top-level measurement field (i.e. data.a) is the field we are trying to + // check, so we can target it directly. + if (_handleTerminalElementForHaveArrayAlongBucketDataPath(e)) { + return true; + } + } else { + // We'll need to inspect at least one more field, but we can take advantage of + // the bucket structure here to skip checking for any field at this depth with a + // dot in the name (e.g. '1.b') and instead just look at the next level down + // (e.g. look within '1' for a field named 'b'). + foundArray = + _handleIntermediateElementForHaveArrayAlongBucketDataPath(e, path, depth); } - BSONElement sub = obj.getField(subPath); - const bool foundArray = - _handleElementForHaveArrayAlongBucketDataPath(obj, sub, subPath, depth); if (foundArray) { return foundArray; } diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp b/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp index afabfc4e0a6..072deec9b77 100644 --- a/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp +++ b/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp @@ -100,6 +100,12 @@ TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) { b: true } } + }, + i: { + "1": [ + {a: true}, + {a: false} + ] } } })"); @@ -115,7 +121,7 @@ TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) { ASSERT_FALSE( tdps::haveArrayAlongBucketDataPath(obj, "data.b")); // bucket expansion hides array ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.c")); - invariant(tdps::haveArrayAlongBucketDataPath(obj, "data.d")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.d")); ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.e")); ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.f")); ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.f.a")); @@ -123,6 +129,8 @@ TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) { ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a")); ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a.a")); ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.h.a.b")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.i")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.i.a")); }); } @@ -365,6 +373,12 @@ TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) { b: false } } + }, + i: { + "1": [ + {a: true}, + {a: false} + ] } } })"); @@ -379,7 +393,8 @@ TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) { expected.emplace(el); } - ASSERT_EQ(actual.size(), expected.size()); + ASSERT_EQ(actual.size(), expected.size()) + << "Expected path '" << path << "' to yield " << expectedStorage << " from " << obj; auto actualIt = actual.begin(); auto expectedIt = expected.begin(); @@ -408,6 +423,7 @@ TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) { BSON_ARRAY(BSON("a" << BSON("b" << true)) << BSON("a" << BSON("b" << false)))); assertExtractionMatches("data.h.a"_sd, BSON_ARRAY(BSON("b" << true) << BSON("b" << false))); assertExtractionMatches("data.h.a.b"_sd, BSON_ARRAY(true << false)); + assertExtractionMatches("data.i.a"_sd, BSON_ARRAY(true << false)); }); } } // namespace -- cgit v1.2.1