diff options
author | Dan Larkin-York <dan.larkin-york@mongodb.com> | 2022-05-06 18:09:55 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-05-06 18:34:06 +0000 |
commit | c4577480b324634daf96ce685faf73f106f33e0d (patch) | |
tree | bf0623d7333cd52f7c28964b6c438bbbe7c5f5d3 | |
parent | 8adb070dcb43842e61da3e15415ad4e132d1655a (diff) | |
download | mongo-c4577480b324634daf96ce685faf73f106f33e0d.tar.gz |
SERVER-65406 Handle compressed buckets in timeseries dotted path support library
-rw-r--r-- | jstests/core/timeseries/timeseries_metric_index_2dsphere.js | 35 | ||||
-rw-r--r-- | src/mongo/db/index/expression_keys_private.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/index/s2_bucket_key_generator_test.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/timeseries/SConscript | 3 | ||||
-rw-r--r-- | src/mongo/db/timeseries/bucket_compression.cpp | 21 | ||||
-rw-r--r-- | src/mongo/db/timeseries/bucket_compression.h | 5 | ||||
-rw-r--r-- | src/mongo/db/timeseries/timeseries_dotted_path_support.cpp | 300 | ||||
-rw-r--r-- | src/mongo/db/timeseries/timeseries_dotted_path_support.h | 30 | ||||
-rw-r--r-- | src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp | 355 |
9 files changed, 523 insertions, 247 deletions
diff --git a/jstests/core/timeseries/timeseries_metric_index_2dsphere.js b/jstests/core/timeseries/timeseries_metric_index_2dsphere.js index 09ff02c8999..e47119b5bbd 100644 --- a/jstests/core/timeseries/timeseries_metric_index_2dsphere.js +++ b/jstests/core/timeseries/timeseries_metric_index_2dsphere.js @@ -66,33 +66,52 @@ TimeseriesTest.run((insert) => { const twoDSphereDocs = [ { _id: 0, - [timeFieldName]: ISODate(), + [timeFieldName]: ISODate("2022-04-01T00:00:00.000Z"), [metaFieldName]: "m1", location: {type: "Point", coordinates: [40, -70]} }, { _id: 1, - [timeFieldName]: ISODate(), + [timeFieldName]: ISODate("2022-04-01T00:01:00.000Z"), // should land in same bucket [metaFieldName]: "m1", location: {type: "Point", coordinates: [40.1, -70.1]} }, { _id: 2, - [timeFieldName]: ISODate(), + [timeFieldName]: ISODate("2022-04-01T00:00:00.000Z"), [metaFieldName]: "m2", location: {type: "Point", coordinates: [40.2, -70.2]} }, { _id: 3, - [timeFieldName]: ISODate(), + [timeFieldName]: ISODate("2022-04-01T00:01:00.000Z"), // should land in same bucket [metaFieldName]: "m2", - location: {type: "Point", coordinates: [40.2, -70.2]} + location: {type: "Point", coordinates: [40.3, -70.3]} + }, + { + _id: 4, + [timeFieldName]: ISODate("2022-04-01T00:02:00.000Z"), // should land in same bucket + [metaFieldName]: "m2", + location: {type: "Point", coordinates: [40.4, -70.4]} + }, + { + _id: 5, + [timeFieldName]: + ISODate("2022-04-01T02:00:00.000Z"), // should open new bucket and compress old one + [metaFieldName]: "m2", + location: {type: "Point", coordinates: [40.5, -70.5]} + }, + { + _id: 6, + [timeFieldName]: ISODate("2022-04-01T02:01:00.000Z"), // should land in same bucket + [metaFieldName]: "m2" }, - {_id: 4, [timeFieldName]: ISODate(), [metaFieldName]: "m2"}, - {_id: 5, [timeFieldName]: ISODate(), [metaFieldName]: "m3"}, + {_id: 7, [timeFieldName]: ISODate("2022-04-01T00:00:00.000Z"), [metaFieldName]: "m3"}, ]; assert.commandWorked(insert(timeseriescoll, twoDSphereDocs), 'Failed to insert twoDSphereDocs: ' + tojson(twoDSphereDocs)); + assert.eq(bucketscoll.count(), 4); + printjson(bucketscoll.find({}).toArray()); // Test invalid documents const docWithInvalidCoordinates = { @@ -140,7 +159,7 @@ TimeseriesTest.run((insert) => { timeseriescoll.find({location: {$geoWithin: {$center: [[40, -70], .15]}}}).toArray().length, geoWithinPlan2d); - assert.eq(4, + assert.eq(6, timeseriescoll .aggregate([{ $geoNear: { diff --git a/src/mongo/db/index/expression_keys_private.cpp b/src/mongo/db/index/expression_keys_private.cpp index 4bc13b10421..3d957f15519 100644 --- a/src/mongo/db/index/expression_keys_private.cpp +++ b/src/mongo/db/index/expression_keys_private.cpp @@ -706,12 +706,13 @@ void ExpressionKeysPrivate::getS2Keys(SharedBufferFragmentBuilder& pooledBufferB std::vector<KeyString::HeapBuilder> updatedKeysToAdd; if (IndexNames::GEO_2DSPHERE_BUCKET == keyElem.str()) { - timeseries::dotted_path_support::extractAllElementsAlongBucketPath( - obj, - keyElem.fieldName(), - fieldElements, - expandArrayOnTrailingField, - arrayComponents); + auto elementStorage = + timeseries::dotted_path_support::extractAllElementsAlongBucketPath( + obj, + keyElem.fieldName(), + fieldElements, + expandArrayOnTrailingField, + arrayComponents); // null, undefined, {} and [] should all behave like there is no geo field. So we // look for these cases and ignore those measurements if we find them. diff --git a/src/mongo/db/index/s2_bucket_key_generator_test.cpp b/src/mongo/db/index/s2_bucket_key_generator_test.cpp index f501f5cc781..89187d275b0 100644 --- a/src/mongo/db/index/s2_bucket_key_generator_test.cpp +++ b/src/mongo/db/index/s2_bucket_key_generator_test.cpp @@ -137,7 +137,7 @@ struct S2BucketKeyGeneratorTest : public unittest::Test { TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeys) { BSONObj keyPattern = fromjson("{'data.geo': '2dsphere_bucket'}"); BSONObj genKeysFrom = fromjson( - "{data: {geo: {" + "{control: {version: 1}, data: {geo: {" "'0': {type: 'Point', coordinates: [0, 0]}," "'1': {type: 'Point', coordinates: [3, 3]}" "}}}"); @@ -165,7 +165,7 @@ TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeys) { TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeysSubField) { BSONObj keyPattern = fromjson("{'data.geo.sub': '2dsphere_bucket'}"); BSONObj genKeysFrom = fromjson( - "{data: {geo: {" + "{control: {version: 1}, data: {geo: {" "'0': {sub: {type: 'Point', coordinates: [0, 0]}}," "'1': {sub: {type: 'Point', coordinates: [3, 3]}}" "}}}"); @@ -194,7 +194,7 @@ TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeysSubField) { TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeysDeepSubField) { BSONObj keyPattern = fromjson("{'data.geo.sub1.sub2.sub3': '2dsphere_bucket'}"); BSONObj genKeysFrom = fromjson( - "{data: {geo: {" + "{control: {version: 1}, data: {geo: {" "'0': {sub1: {sub2: {sub3: {type: 'Point', coordinates: [0, 0]}}}}," "'1': {sub1: {sub2: {sub3: {type: 'Point', coordinates: [3, 3]}}}}" "}}}"); @@ -223,7 +223,7 @@ TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeysDeepSubField) { TEST_F(S2BucketKeyGeneratorTest, GetS2BucketKeysSubFieldSomeMissing) { BSONObj keyPattern = fromjson("{'data.geo.sub': '2dsphere_bucket'}"); BSONObj genKeysFrom = fromjson( - "{data: {geo: {" + "{control: {version: 1}, data: {geo: {" "'0': {sub: {type: 'Point', coordinates: [0, 0]}}," "'1': {sub: {}}," "'2': {sub: null}," diff --git a/src/mongo/db/timeseries/SConscript b/src/mongo/db/timeseries/SConscript index 519725d38b4..aa103c25a8b 100644 --- a/src/mongo/db/timeseries/SConscript +++ b/src/mongo/db/timeseries/SConscript @@ -88,12 +88,14 @@ env.Library( LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/base', '$BUILD_DIR/mongo/bson/mutable/mutable_bson', + '$BUILD_DIR/mongo/bson/util/bson_column', '$BUILD_DIR/mongo/db/exec/bucket_unpacker', '$BUILD_DIR/mongo/db/index_commands_idl', '$BUILD_DIR/mongo/db/index_names', '$BUILD_DIR/mongo/db/namespace_string', '$BUILD_DIR/mongo/db/ops/write_ops_parsers', '$BUILD_DIR/mongo/db/storage/storage_options', + 'bucket_compression', 'timeseries_options', ], ) @@ -130,6 +132,7 @@ env.CppUnitTest( '$BUILD_DIR/mongo/db/catalog/catalog_test_fixture', '$BUILD_DIR/mongo/db/catalog_raii', 'bucket_catalog', + 'bucket_compression', 'timeseries_conversion_util', 'timeseries_options', ], diff --git a/src/mongo/db/timeseries/bucket_compression.cpp b/src/mongo/db/timeseries/bucket_compression.cpp index 2f8eefd3a84..7d6e6961687 100644 --- a/src/mongo/db/timeseries/bucket_compression.cpp +++ b/src/mongo/db/timeseries/bucket_compression.cpp @@ -342,5 +342,26 @@ CompressionResult compressBucket(const BSONObj& bucketDoc, return {}; } +bool isCompressedBucket(const BSONObj& bucketDoc) { + auto&& controlField = bucketDoc[timeseries::kBucketControlFieldName]; + uassert(6540600, + "Time-series bucket documents must have 'control' object present", + controlField && controlField.type() == BSONType::Object); + + auto&& versionField = controlField.Obj()[timeseries::kBucketControlVersionFieldName]; + uassert(6540601, + "Time-series bucket documents must have 'control.version' field present", + versionField && isNumericBSONType(versionField.type())); + auto version = versionField.Number(); + + if (version == 1) { + return false; + } else if (version == 2) { + return true; + } else { + uasserted(6540602, "Invalid bucket version"); + } +} + } // namespace timeseries } // namespace mongo diff --git a/src/mongo/db/timeseries/bucket_compression.h b/src/mongo/db/timeseries/bucket_compression.h index 7921bca1227..e70bec965d7 100644 --- a/src/mongo/db/timeseries/bucket_compression.h +++ b/src/mongo/db/timeseries/bucket_compression.h @@ -60,5 +60,10 @@ CompressionResult compressBucket(const BSONObj& bucketDoc, const NamespaceString& nss, bool validateDecompression); +/** + * Returns whether a timeseries bucket has been compressed to the v2 format. + */ +bool isCompressedBucket(const BSONObj& bucketDoc); + } // namespace timeseries } // namespace mongo diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp b/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp index fc388e5fa0c..b28e198429e 100644 --- a/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp +++ b/src/mongo/db/timeseries/timeseries_dotted_path_support.cpp @@ -37,6 +37,8 @@ #include "mongo/bson/bsonmisc.h" #include "mongo/bson/bsonobj.h" #include "mongo/bson/bsonobjbuilder.h" +#include "mongo/bson/util/bsoncolumn.h" +#include "mongo/db/timeseries/bucket_compression.h" #include "mongo/db/timeseries/timeseries_constants.h" #include "mongo/util/ctype.h" @@ -58,93 +60,130 @@ boost::optional<std::pair<StringData, StringData>> _splitPath(StringData path) { return std::make_pair(left, next); } -template <typename BSONElementColl> -void _extractAllElementsAlongBucketPath(const BSONObj& obj, - StringData path, - BSONElementColl& elements, - bool expandArrayOnTrailingField, - BSONDepthIndex depth, - MultikeyComponents* arrayComponents) { - auto handleElement = [&](BSONElement elem, StringData path) -> void { - if (elem.eoo()) { - size_t idx = path.find('.'); - if (idx != std::string::npos) { - invariant(depth != std::numeric_limits<BSONDepthIndex>::max()); - StringData left = path.substr(0, idx); - StringData next = path.substr(idx + 1, path.size()); +void _handleElementForExtractAllElementsOnBucketPath(const BSONObj& obj, + BSONElement elem, + StringData path, + BSONElementSet& elements, + bool expandArrayOnTrailingField, + BSONDepthIndex depth, + MultikeyComponents* arrayComponents) { + if (elem.eoo()) { + size_t idx = path.find('.'); + if (idx != std::string::npos) { + invariant(depth != std::numeric_limits<BSONDepthIndex>::max()); + StringData left = path.substr(0, idx); + StringData next = path.substr(idx + 1, path.size()); - BSONElement e = obj.getField(left); + BSONElement e = obj.getField(left); - if (e.type() == Object) { - _extractAllElementsAlongBucketPath(e.embeddedObject(), - next, - elements, - expandArrayOnTrailingField, - depth + 1, - arrayComponents); - } else if (e.type() == Array) { - bool allDigits = false; - if (next.size() > 0 && ctype::isDigit(next[0])) { - unsigned temp = 1; - while (temp < next.size() && ctype::isDigit(next[temp])) - temp++; - allDigits = temp == next.size() || next[temp] == '.'; - } - if (allDigits) { - _extractAllElementsAlongBucketPath(e.embeddedObject(), - next, - elements, - expandArrayOnTrailingField, - depth + 1, - arrayComponents); - } else { - BSONObjIterator i(e.embeddedObject()); - while (i.more()) { - BSONElement e2 = i.next(); - if (e2.type() == Object || e2.type() == Array) - _extractAllElementsAlongBucketPath(e2.embeddedObject(), - next, - elements, - expandArrayOnTrailingField, - depth + 1, - arrayComponents); - } - if (arrayComponents) { - arrayComponents->insert(depth); + if (e.type() == Object) { + BSONObj embedded = e.embeddedObject(); + _handleElementForExtractAllElementsOnBucketPath(embedded, + embedded.getField(next), + next, + elements, + expandArrayOnTrailingField, + depth + 1, + arrayComponents); + } else if (e.type() == Array) { + bool allDigits = false; + if (next.size() > 0 && ctype::isDigit(next[0])) { + unsigned temp = 1; + while (temp < next.size() && ctype::isDigit(next[temp])) + temp++; + allDigits = temp == next.size() || next[temp] == '.'; + } + if (allDigits) { + BSONObj embedded = e.embeddedObject(); + _handleElementForExtractAllElementsOnBucketPath(embedded, + embedded.getField(next), + next, + elements, + expandArrayOnTrailingField, + depth + 1, + arrayComponents); + } else { + BSONObjIterator i(e.embeddedObject()); + while (i.more()) { + BSONElement e2 = i.next(); + if (e2.type() == Object || e2.type() == Array) { + BSONObj embedded = e2.embeddedObject(); + _handleElementForExtractAllElementsOnBucketPath( + embedded, + embedded.getField(next), + next, + elements, + expandArrayOnTrailingField, + depth + 1, + arrayComponents); } } - } else { - // do nothing: no match - } - } - } else { - if (elem.type() == Array && expandArrayOnTrailingField) { - BSONObjIterator i(elem.embeddedObject()); - while (i.more()) { - elements.insert(i.next()); - } - if (arrayComponents) { - arrayComponents->insert(depth); + if (arrayComponents) { + arrayComponents->insert(depth); + } } } else { - elements.insert(elem); + // do nothing: no match } } - }; + } else { + if (elem.type() == Array && expandArrayOnTrailingField) { + BSONObjIterator i(elem.embeddedObject()); + while (i.more()) { + elements.insert(i.next()); + } + if (arrayComponents) { + arrayComponents->insert(depth); + } + } else { + elements.insert(elem); + } + } +} +boost::optional<BSONColumn> _extractAllElementsAlongBucketPath( + const BSONObj& obj, + StringData path, + BSONElementSet& elements, + bool expandArrayOnTrailingField, + bool isCompressed, + BSONDepthIndex depth, + MultikeyComponents* arrayComponents) { switch (depth) { case 0: case 1: { if (auto res = _splitPath(path)) { auto& [left, next] = *res; BSONElement e = obj.getField(left); - if (e.type() == Object && (depth > 0 || left == timeseries::kBucketDataFieldName)) { - _extractAllElementsAlongBucketPath(e.embeddedObject(), - next, - elements, - expandArrayOnTrailingField, - depth + 1, - arrayComponents); + if (depth > 0 || left == timeseries::kBucketDataFieldName) { + if (e.type() == Object) { + return _extractAllElementsAlongBucketPath(e.embeddedObject(), + next, + elements, + expandArrayOnTrailingField, + isCompressed, + depth + 1, + arrayComponents); + } else if (isCompressed && e.type() == BinData) { + // Unbucketing magic happens here for nested measurement fields (i.e. + // data.a.b) in compressed buckets. + BSONColumn storage{e}; + for (const BSONElement& e2 : storage) { + if (!e2.eoo()) { + BSONObj embedded = + e2.isABSONObj() ? e2.embeddedObject() : BSONObj(); + _handleElementForExtractAllElementsOnBucketPath( + embedded, + embedded.getField(next), + next, + elements, + expandArrayOnTrailingField, + depth, + arrayComponents); + } + } + return std::move(storage); + } } } else { BSONElement e = obj.getField(path); @@ -153,32 +192,55 @@ void _extractAllElementsAlongBucketPath(const BSONObj& obj, StringData(), elements, expandArrayOnTrailingField, + isCompressed, depth + 1, arrayComponents); + } else if (isCompressed && BinData == e.type()) { + // Unbucketing magic happens here for top-level measurement fields (i.e. data.a) + // in compressed buckets. + invariant(depth == 1); + BSONColumn storage{e}; + for (const BSONElement& e2 : storage) { + if (!e2.eoo()) { + BSONObj embedded = e2.isABSONObj() ? e2.embeddedObject() : BSONObj(); + _handleElementForExtractAllElementsOnBucketPath( + embedded, + e2, + ""_sd, + elements, + expandArrayOnTrailingField, + depth, + arrayComponents); + } + } + return std::move(storage); } } break; } case 2: { - // Unbucketing magic happens here. + // Unbucketing magic happens here for uncompressed buckets. + invariant(!isCompressed); for (const BSONElement& e : obj) { std::string subPath = e.fieldName(); if (!path.empty()) { subPath.append("." + path); } - BSONElement sub = obj.getField(subPath); - handleElement(sub, subPath); + _handleElementForExtractAllElementsOnBucketPath(obj, + obj.getField(subPath), + subPath, + elements, + expandArrayOnTrailingField, + depth, + arrayComponents); } break; } - default: { - BSONElement e = obj.getField(path); - handleElement(e, path); - break; - } + default: { MONGO_UNREACHABLE; } } -} + return boost::none; +} bool _haveArrayAlongBucketDataPath(const BSONObj& obj, StringData path, BSONDepthIndex depth); @@ -198,7 +260,9 @@ bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj, BSONElement e = obj.getField(left); if (e.type() == Object) { - return _haveArrayAlongBucketDataPath(e.embeddedObject(), next, depth + 1); + auto embedded = e.embeddedObject(); + return _handleElementForHaveArrayAlongBucketDataPath( + embedded, embedded.getField(next), next, depth + 1); } else if (e.type() == Array) { return true; } else { @@ -214,27 +278,56 @@ bool _handleElementForHaveArrayAlongBucketDataPath(const BSONObj& obj, return false; } -bool _haveArrayAlongBucketDataPath(const BSONObj& obj, StringData path, BSONDepthIndex depth) { +bool _haveArrayAlongBucketDataPath(const BSONObj& obj, + StringData path, + bool isCompressed, + BSONDepthIndex depth) { switch (depth) { case 0: case 1: { if (auto res = _splitPath(path)) { auto& [left, next] = *res; BSONElement e = obj.getField(left); - if (e.type() == Object && (depth > 0 || left == timeseries::kBucketDataFieldName)) { - return _haveArrayAlongBucketDataPath(e.embeddedObject(), next, depth + 1); + if (depth > 0 || left == timeseries::kBucketDataFieldName) { + if (e.type() == Object) { + return _haveArrayAlongBucketDataPath( + e.embeddedObject(), next, isCompressed, depth + 1); + } else if (isCompressed && BinData == e.type()) { + // Unbucketing magic happens here for nested measurement fields (i.e. + // data.a.b) in compressed buckets. + BSONColumn column{e}; + for (const BSONElement& e2 : column) { + BSONObj embedded = e2.isABSONObj() ? e2.embeddedObject() : BSONObj(); + const bool foundArray = _handleElementForHaveArrayAlongBucketDataPath( + embedded, embedded.getField(next), next, depth); + if (foundArray) { + return foundArray; + } + } + } } } else { BSONElement e = obj.getField(path); if (Object == e.type()) { return _haveArrayAlongBucketDataPath( - e.embeddedObject(), StringData(), depth + 1); + e.embeddedObject(), StringData(), isCompressed, depth + 1); + } else if (BinData == e.type()) { + // Unbucketing magic happens here for top-level measurement fields (i.e. data.a) + // in compressed buckets. + invariant(isCompressed && depth == 1); + BSONColumn column{e}; + for (const BSONElement& e2 : column) { + if (e2.type() == Array) { + return true; + } + } } } return false; } case 2: { - // Unbucketing magic happens here. + // Unbucketing magic happens here for uncompressed buckets. + invariant(!isCompressed); for (const BSONElement& e : obj) { std::string subPath = e.fieldName(); if (!path.empty()) { @@ -367,24 +460,20 @@ Decision _fieldContainsArrayData(const BSONObj& min, const BSONObj& max, StringD } // namespace -void extractAllElementsAlongBucketPath(const BSONObj& obj, - StringData path, - BSONElementSet& elements, - bool expandArrayOnTrailingField, - MultikeyComponents* arrayComponents) { - constexpr BSONDepthIndex initialDepth = 0; - _extractAllElementsAlongBucketPath( - obj, path, elements, expandArrayOnTrailingField, initialDepth, arrayComponents); -} - -void extractAllElementsAlongBucketPath(const BSONObj& obj, - StringData path, - BSONElementMultiSet& elements, - bool expandArrayOnTrailingField, - MultikeyComponents* arrayComponents) { +boost::optional<BSONColumn> extractAllElementsAlongBucketPath(const BSONObj& obj, + StringData path, + BSONElementSet& elements, + bool expandArrayOnTrailingField, + MultikeyComponents* arrayComponents) { constexpr BSONDepthIndex initialDepth = 0; - _extractAllElementsAlongBucketPath( - obj, path, elements, expandArrayOnTrailingField, initialDepth, arrayComponents); + const bool isCompressed = timeseries::isCompressedBucket(obj); + return _extractAllElementsAlongBucketPath(obj, + path, + elements, + expandArrayOnTrailingField, + isCompressed, + initialDepth, + arrayComponents); } bool haveArrayAlongBucketDataPath(const BSONObj& bucketObj, StringData path) { @@ -394,7 +483,8 @@ bool haveArrayAlongBucketDataPath(const BSONObj& bucketObj, StringData path) { } constexpr BSONDepthIndex initialDepth = 0; - return _haveArrayAlongBucketDataPath(bucketObj, path, initialDepth); + const bool isCompressed = timeseries::isCompressedBucket(bucketObj); + return _haveArrayAlongBucketDataPath(bucketObj, path, isCompressed, initialDepth); } std::ostream& operator<<(std::ostream& s, const Decision& i) { diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support.h b/src/mongo/db/timeseries/timeseries_dotted_path_support.h index 266c86a9117..fa2ebba941b 100644 --- a/src/mongo/db/timeseries/timeseries_dotted_path_support.h +++ b/src/mongo/db/timeseries/timeseries_dotted_path_support.h @@ -29,10 +29,12 @@ #pragma once +#include "boost/any.hpp" #include <cstddef> #include "mongo/bson/bsonelement_comparator_interface.h" #include "mongo/bson/bsonobj.h" +#include "mongo/bson/util/bsoncolumn.h" #include "mongo/db/index/multikey_paths.h" namespace mongo { @@ -48,8 +50,15 @@ namespace dotted_path_support { * original document. Note that the caller should include the 'data' prefix, but omit the depth-2 * numeric path entry that results from pivoting the data into the bucketed format. * - * Other than the bucketing and unbucketing mentioned above, the function should be have roughly - * like `mongo::dotted_path_support::extractAllElementsAlongPath'. + * Other than the bucketing and unbucketing mentioned above, and the return value, the function + * should behave roughly like `mongo::dotted_path_support::extractAllElementsAlongPath'. + * + * In the case that the input bucket has been compressed, the function may need to decompress the + * data in order to examine it and extract the requested data. Since the data is returned as + * BSONElement, which does not own the data it references, we will need to provide storage for the + * decompressed data that will outlive the function call so that the returned BSONElements remain + * valid. This storage is provided via the optional return value. It need not be examined directly, + * but it should not be discarded until after the extracted elements. * * An example: * @@ -57,17 +66,12 @@ namespace dotted_path_support { * {b: 1} and {b: 2} would be added to the set. 'arrayComponents' would be set as * std::set<size_t>{1U}. */ -void extractAllElementsAlongBucketPath(const BSONObj& obj, - StringData path, - BSONElementSet& elements, - bool expandArrayOnTrailingField = true, - MultikeyComponents* arrayComponents = nullptr); - -void extractAllElementsAlongBucketPath(const BSONObj& obj, - StringData path, - BSONElementMultiSet& elements, - bool expandArrayOnTrailingField = true, - MultikeyComponents* arrayComponents = nullptr); +MONGO_WARN_UNUSED_RESULT_FUNCTION boost::optional<BSONColumn> extractAllElementsAlongBucketPath( + const BSONObj& obj, + StringData path, + BSONElementSet& elements, + bool expandArrayOnTrailingField = true, + MultikeyComponents* arrayComponents = nullptr); /** * Finds arrays in individual metrics along the specified data path. diff --git a/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp b/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp index 586b4361f29..638a2c8d6c4 100644 --- a/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp +++ b/src/mongo/db/timeseries/timeseries_dotted_path_support_test.cpp @@ -31,7 +31,9 @@ #include "mongo/platform/basic.h" #include "mongo/bson/bsonmisc.h" +#include "mongo/db/timeseries/bucket_compression.h" #include "mongo/db/timeseries/timeseries_dotted_path_support.h" +#include "mongo/unittest/bson_test_util.h" #include "mongo/unittest/unittest.h" namespace mongo { @@ -39,12 +41,32 @@ namespace { namespace tdps = ::mongo::timeseries::dotted_path_support; -TEST(TimeseriesDottedPathSupport, HaveArrayAlongBucketPath) { - BSONObj obj = ::mongo::fromjson(R"( +class TimeseriesDottedPathSupportTest : public unittest::Test { +protected: + void runTest(const BSONObj& obj, const std::function<void(const BSONObj&)>& test) { + test(obj); + + NamespaceString nss{"test"}; + auto compressionResult = timeseries::compressBucket(obj, "time", nss, true); + ASSERT_TRUE(compressionResult.compressedBucket.has_value()); + ASSERT_FALSE(compressionResult.decompressionFailed); + + test(compressionResult.compressedBucket.get()); + } +}; + +TEST_F(TimeseriesDottedPathSupportTest, HaveArrayAlongBucketPath) { + BSONObj input = ::mongo::fromjson(R"( { control: {version: 1}, bogus: [1], data: { + time: { + "0": {"$date": "1970-01-01T00:00:00.000Z"}, + "1": {"$date": "1970-01-01T00:00:00.001Z"}, + "3": {"$date": "1970-01-01T00:00:00.003Z"}, + "99": {"$date": "1970-01-01T00:00:00.099Z"} + }, a: {}, b: [], c: { @@ -70,31 +92,41 @@ TEST(TimeseriesDottedPathSupport, HaveArrayAlongBucketPath) { {a: true} ] } + }, + h: { + "1": { + a: { + b: true + } + } } } })"); - // Non-data fields should always be false - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "control")); - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "control.version")); - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "bogus")); + runTest(input, [this](const BSONObj& obj) { + // Non-data fields should always be false + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "control")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "control.version")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "bogus")); - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data")); - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.a")); - ASSERT_FALSE( - tdps::haveArrayAlongBucketDataPath(obj, "data.b")); // bucket expansion hides array - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.c")); - ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.d")); - ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.e")); - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.f")); - ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.f.a")); - ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.g")); - ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a")); - ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a.a")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.a")); + ASSERT_FALSE( + tdps::haveArrayAlongBucketDataPath(obj, "data.b")); // bucket expansion hides array + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.c")); + invariant(tdps::haveArrayAlongBucketDataPath(obj, "data.d")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.e")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.f")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.f.a")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.g")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a")); + ASSERT_TRUE(tdps::haveArrayAlongBucketDataPath(obj, "data.g.a.a")); + ASSERT_FALSE(tdps::haveArrayAlongBucketDataPath(obj, "data.h.a.b")); + }); } -TEST(TimeseriesDottedPathSupport, fieldContainsArrayData) { - BSONObj obj = ::mongo::fromjson(R"( +TEST_F(TimeseriesDottedPathSupportTest, fieldContainsArrayData) { + BSONObj input = ::mongo::fromjson(R"( { control: { min: { @@ -169,112 +201,213 @@ TEST(TimeseriesDottedPathSupport, fieldContainsArrayData) { g: true } } + }, + data: { + time: { + "0": {"$date": "1970-01-01T00:00:00.000Z"} + } } })"); - // Because this function is meant as an optimization to avoid a more expensive check, we need - // to ensure that we don't make wrong decisions, but we can always fall through to the more - // expensive check. So if the *best* decision that the function could make with the information - // present in the control fields is "Yes", we will accept "Yes" or "Maybe". Similarly if the - // best decision it could make is "No", we will accept "No" or "Maybe". If there isn't enough - // information in the control fields to make a firm decision, then it must return "Maybe". + runTest(input, [this](const BSONObj& obj) { + // Because this function is meant as an optimization to avoid a more expensive check, we + // need to ensure that we don't make wrong decisions, but we can always fall through to the + // more expensive check. So if the *best* decision that the function could make with the + // information present in the control fields is "Yes", we will accept "Yes" or "Maybe". + // Similarly if the best decision it could make is "No", we will accept "No" or "Maybe". If + // there isn't enough information in the control fields to make a firm decision, then it + // must return "Maybe". - // A few assumptions about type orders necessary to understand the below tests: - // eoo < double < array < bool + // A few assumptions about type orders necessary to understand the below tests: + // eoo < double < array < bool - constexpr auto yes = tdps::Decision::Yes; - constexpr auto no = tdps::Decision::No; - constexpr auto maybe = tdps::Decision::Maybe; + constexpr auto yes = tdps::Decision::Yes; + constexpr auto no = tdps::Decision::No; + constexpr auto maybe = tdps::Decision::Maybe; - // a: {min: double, max: bool}, - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "a")); + // a: {min: double, max: bool}, + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "a")); - // b: {min: bool, max: bool} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "b")); + // b: {min: bool, max: bool} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "b")); - // c: {min: double, max: double} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "c")); + // c: {min: double, max: double} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "c")); - // d: {min: double, max: array} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "d")); + // d: {min: double, max: array} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "d")); - // e: {min: array, max: bool} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "e")); + // e: {min: array, max: bool} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "e")); - // f: {min: array, max: array} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "f")); + // f: {min: array, max: array} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "f")); - // g: {min: double, max: object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "g")); - // g.a: {min: double.eoo, max: object.bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "g.a")); - // g.b: {min: double.eoo, max: object.double} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "g.b")); - // g.c: {min: double.eoo, max: object.array} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "g.c")); - // g.d: {min: double.eoo, max: object.eoo} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "g.d")); + // g: {min: double, max: object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "g")); + // g.a: {min: double.eoo, max: object.bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "g.a")); + // g.b: {min: double.eoo, max: object.double} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "g.b")); + // g.c: {min: double.eoo, max: object.array} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "g.c")); + // g.d: {min: double.eoo, max: object.eoo} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "g.d")); - // h: {min: object, max: bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h")); - // h.a: {min: object.bool, max: bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.a")); - // h.b: {min: object.double, max: bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.b")); - // h.c: {min: object.array, max: bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.c")); - // h.d: {min: object.eoo, max: bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.d")); + // h: {min: object, max: bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h")); + // h.a: {min: object.bool, max: bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.a")); + // h.b: {min: object.double, max: bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.b")); + // h.c: {min: object.array, max: bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.c")); + // h.d: {min: object.eoo, max: bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "h.d")); - // i: {min: object, max: object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i")); - // i.a: {min: object.double, max: object.bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.a")); - // i.b: {min: object.array, max: object.array} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "i.b")); - // i.c: {min: object.bool, max: object.bool} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.c")); - // i.d: {min: object.object, max: object.object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.d")); - // i.d.a: {min: object.object.double, max: object.object.bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.d.a")); - // i.e: {min: object.object, max: object.object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.e")); - // i.e.a: {min: object.object.double, max: object.object.object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.e.a")); - // i.f: {min: object.double, max: object.object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f")); - // i.f.a: {min: object.double.eoo, max: object.object.double} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.a")); - // i.f.b: {min: object.double.eoo, max: object.object.object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.b")); - // i.f.c: {min: object.double.eoo, max: object.object.object} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.c")); - // i.f.c.a: {min: object.double.eoo.eoo, max: object.object.object.double} - ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.c.a")); - // i.f.c.b: {min: object.double.eoo.eoo, max: object.object.object.bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.f.c.b")); - // i.f.d: {min: object.double.eoo, max: object.object.array} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "i.f.d")); - // i.f.e: {min: object.double.eoo, max: object.object.bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.f.e")); - // i.g: {min: object.object, max: object.bool} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g")); - // i.g.a: {min: object.object.double, max: object.bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.a")); - // i.g.b: {min: object.object.object, max: object.bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.b")); - // i.g.c: {min: object.object.object, max: object.bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.c")); - // i.g.c.a: {min: object.object.object.double, max: object.bool.eoo.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.c.a")); - // i.g.c.b: {min: object.object.object.bool, max: object.bool.eoo.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.c.b")); - // i.g.d: {min: object.object.array, max: object.bool.eoo} - ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "i.g.d")); - // i.g.e: {min: object.object.bool, max: object.bool.eoo} - ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.e")); + // i: {min: object, max: object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i")); + // i.a: {min: object.double, max: object.bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.a")); + // i.b: {min: object.array, max: object.array} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "i.b")); + // i.c: {min: object.bool, max: object.bool} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.c")); + // i.d: {min: object.object, max: object.object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.d")); + // i.d.a: {min: object.object.double, max: object.object.bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.d.a")); + // i.e: {min: object.object, max: object.object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.e")); + // i.e.a: {min: object.object.double, max: object.object.object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.e.a")); + // i.f: {min: object.double, max: object.object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f")); + // i.f.a: {min: object.double.eoo, max: object.object.double} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.a")); + // i.f.b: {min: object.double.eoo, max: object.object.object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.b")); + // i.f.c: {min: object.double.eoo, max: object.object.object} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.c")); + // i.f.c.a: {min: object.double.eoo.eoo, max: object.object.object.double} + ASSERT_NE(yes, tdps::fieldContainsArrayData(obj, "i.f.c.a")); + // i.f.c.b: {min: object.double.eoo.eoo, max: object.object.object.bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.f.c.b")); + // i.f.d: {min: object.double.eoo, max: object.object.array} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "i.f.d")); + // i.f.e: {min: object.double.eoo, max: object.object.bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.f.e")); + // i.g: {min: object.object, max: object.bool} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g")); + // i.g.a: {min: object.object.double, max: object.bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.a")); + // i.g.b: {min: object.object.object, max: object.bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.b")); + // i.g.c: {min: object.object.object, max: object.bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.c")); + // i.g.c.a: {min: object.object.object.double, max: object.bool.eoo.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.c.a")); + // i.g.c.b: {min: object.object.object.bool, max: object.bool.eoo.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.c.b")); + // i.g.d: {min: object.object.array, max: object.bool.eoo} + ASSERT_NE(no, tdps::fieldContainsArrayData(obj, "i.g.d")); + // i.g.e: {min: object.object.bool, max: object.bool.eoo} + ASSERT_EQ(maybe, tdps::fieldContainsArrayData(obj, "i.g.e")); + }); } +TEST_F(TimeseriesDottedPathSupportTest, ExtractAllElementsAlongBucketPath) { + BSONObj input = ::mongo::fromjson(R"( +{ + control: {version: 1}, + data: { + time: { + "0": {"$date": "1970-01-01T00:00:00.000Z"}, + "1": {"$date": "1970-01-01T00:00:00.001Z"}, + "3": {"$date": "1970-01-01T00:00:00.003Z"}, + "99": {"$date": "1970-01-01T00:00:00.099Z"} + }, + a: {}, + b: [], + c: { + "0": true, + "1": false + }, + d: { + "0": false, + "1": [] + }, + e: { + "3": "foo", + "99": [1, 2] + }, + f: { + "1": { + a: [true, false] + } + }, + g: { + "1": { + a: [ + {a: true} + ] + } + }, + h: { + "1": { + a: { + b: true + } + }, + "3": { + a: { + b: false + } + } + } + } +})"); + + runTest(input, [this](const BSONObj& obj) { + auto assertExtractionMatches = [&](StringData path, const BSONArray expectedStorage) { + BSONElementSet actual; + auto actualStorage = tdps::extractAllElementsAlongBucketPath(obj, path, actual); + + BSONElementSet expected; + for (auto&& el : expectedStorage) { + expected.emplace(el); + } + + ASSERT_EQ(actual.size(), expected.size()); + + auto actualIt = actual.begin(); + auto expectedIt = expected.begin(); + while (actualIt != actual.end() && expectedIt != expected.end()) { + ASSERT_FALSE(actualIt->eoo()); + ASSERT_FALSE(expectedIt->eoo()); + ASSERT_EQ(actualIt->woCompare(*expectedIt, 0), 0); + actualIt++; + expectedIt++; + } + }; + + assertExtractionMatches("data.a"_sd, BSONArray()); + assertExtractionMatches("data.b"_sd, BSONArray()); + assertExtractionMatches("data.c"_sd, BSON_ARRAY(true << false)); + assertExtractionMatches("data.d"_sd, BSON_ARRAY(false)); + assertExtractionMatches("data.e"_sd, BSON_ARRAY("foo" << 1 << 2)); + assertExtractionMatches("data.f"_sd, BSON_ARRAY(BSON("a" << BSON_ARRAY(true << false)))); + assertExtractionMatches("data.f.a"_sd, BSON_ARRAY(true << false)); + assertExtractionMatches("data.g"_sd, + BSON_ARRAY(BSON("a" << BSON_ARRAY(BSON("a" << true))))); + assertExtractionMatches("data.g.a"_sd, BSON_ARRAY(BSON("a" << true))); + assertExtractionMatches("data.g.a.a"_sd, BSON_ARRAY(true)); + assertExtractionMatches( + "data.h"_sd, + BSON_ARRAY(BSON("a" << BSON("b" << true)) << BSON("a" << BSON("b" << false)))); + assertExtractionMatches("data.h.a"_sd, BSON_ARRAY(BSON("b" << true) << BSON("b" << false))); + assertExtractionMatches("data.h.a.b"_sd, BSON_ARRAY(true << false)); + }); +} } // namespace } // namespace mongo |