SERVER-60672 Simpler pushdown when timeseries collection has no mixed-schema buckets

author: samontea <merciers.merciers@gmail.com> 2021-11-16 14:54:47 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-14 22:02:37 +0000
commit: 8fb1d8039361bcbf3a0ab77127cc6712783485ca (patch)
tree: 2681117e701eaffd05e48ea49f4f444044a320a5 /src
parent: 56d055c507a94a70e31a014142934060b76710ed (diff)
download: mongo-8fb1d8039361bcbf3a0ab77127cc6712783485ca.tar.gz
6 files changed, 114 insertions, 17 deletions
diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
index c6f353f3293..ed706683ba5 100644
--- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
+++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
@@ -220,8 +220,10 @@ bool fieldIsComputed(BucketSpec spec, std::string field) {
 DocumentSourceInternalUnpackBucket::DocumentSourceInternalUnpackBucket(
     const boost::intrusive_ptr<ExpressionContext>& expCtx,
     BucketUnpacker bucketUnpacker,
-    int bucketMaxSpanSeconds)
+    int bucketMaxSpanSeconds,
+    bool assumeNoMixedSchemaData)
     : DocumentSource(kStageNameInternal, expCtx),
+      _assumeNoMixedSchemaData(assumeNoMixedSchemaData),
       _bucketUnpacker(std::move(bucketUnpacker)),
       _bucketMaxSpanSeconds{bucketMaxSpanSeconds} {}
 
@@ -240,6 +242,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
     auto hasTimeField = false;
     auto hasBucketMaxSpanSeconds = false;
     auto bucketMaxSpanSeconds = 0;
+    auto assumeClean = false;
     std::vector<std::string> computedMetaProjFields;
     for (auto&& elem : specElem.embeddedObject()) {
         auto fieldName = elem.fieldNameStringData();
@@ -268,6 +271,11 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             unpackerBehavior = fieldName == kInclude ? BucketUnpacker::Behavior::kInclude
                                                      : BucketUnpacker::Behavior::kExclude;
             hasIncludeExclude = true;
+        } else if (fieldName == kAssumeNoMixedSchemaData) {
+            uassert(6067202,
+                    str::stream() << "assumeClean field must be a bool, got: " << elem.type(),
+                    elem.type() == BSONType::Bool);
+            assumeClean = elem.boolean();
         } else if (fieldName == timeseries::kTimeFieldName) {
             uassert(5346504,
                     str::stream() << "timeField field must be a string, got: " << elem.type(),
@@ -326,7 +334,10 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             hasBucketMaxSpanSeconds);
 
     return make_intrusive<DocumentSourceInternalUnpackBucket>(
-        expCtx, BucketUnpacker{std::move(bucketSpec), unpackerBehavior}, bucketMaxSpanSeconds);
+        expCtx,
+        BucketUnpacker{std::move(bucketSpec), unpackerBehavior},
+        bucketMaxSpanSeconds,
+        assumeClean);
 }
 
 boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createFromBsonExternal(
@@ -338,6 +349,7 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
 
     BucketSpec bucketSpec;
     auto hasTimeField = false;
+    auto assumeClean = false;
     for (auto&& elem : specElem.embeddedObject()) {
         auto fieldName = elem.fieldNameStringData();
         // We only expose "timeField" and "metaField" as parameters in $_unpackBucket.
@@ -356,6 +368,11 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
                     str::stream() << "metaField field must be a single-element field path",
                     metaField.find('.') == std::string::npos);
             bucketSpec.metaField = std::move(metaField);
+        } else if (fieldName == kAssumeNoMixedSchemaData) {
+            uassert(6067203,
+                    str::stream() << "assumeClean field must be a bool, got: " << elem.type(),
+                    elem.type() == BSONType::Bool);
+            assumeClean = elem.boolean();
         } else {
             uasserted(5612404,
                       str::stream() << "unrecognized parameter to $_unpackBucket: " << fieldName);
@@ -366,7 +383,10 @@ boost::intrusive_ptr<DocumentSource> DocumentSourceInternalUnpackBucket::createF
             hasTimeField);
 
     return make_intrusive<DocumentSourceInternalUnpackBucket>(
-        expCtx, BucketUnpacker{std::move(bucketSpec), BucketUnpacker::Behavior::kExclude}, 3600);
+        expCtx,
+        BucketUnpacker{std::move(bucketSpec), BucketUnpacker::Behavior::kExclude},
+        3600,
+        assumeClean);
 }
 
 void DocumentSourceInternalUnpackBucket::serializeToArray(
@@ -394,6 +414,8 @@ void DocumentSourceInternalUnpackBucket::serializeToArray(
         out.addField(timeseries::kMetaFieldName, Value{*spec.metaField});
     }
     out.addField(kBucketMaxSpanSeconds, Value{_bucketMaxSpanSeconds});
+    if (_assumeNoMixedSchemaData)
+        out.addField(kAssumeNoMixedSchemaData, Value(_assumeNoMixedSchemaData));
 
     if (!spec.computedMetaProjFields.empty())
         out.addField("computedMetaProjFields", Value{[&] {
@@ -545,12 +567,22 @@ std::pair<BSONObj, bool> DocumentSourceInternalUnpackBucket::extractOrBuildProje
  * Creates a predicate that ensures that if there exists a subpath of matchExprPath such that the
  * type of `control.min.subpath` is not the same as `control.max.subpath` then we will match that
  * document.
+ *
+ * However, if the buckets collection has no mixed-schema data then this type-equality predicate is
+ * unnecessary. In that case this function returns an empty, always-true predicate.
  */
 std::unique_ptr<MatchExpression> createTypeEqualityPredicate(
-    boost::intrusive_ptr<ExpressionContext> pExpCtx, const StringData& matchExprPath) {
+    boost::intrusive_ptr<ExpressionContext> pExpCtx,
+    const StringData& matchExprPath,
+    bool assumeNoMixedSchemaData) {
+
+    std::vector<std::unique_ptr<MatchExpression>> typeEqualityPredicates;
+
+    if (assumeNoMixedSchemaData)
+        return std::make_unique<OrMatchExpression>(std::move(typeEqualityPredicates));
+
     FieldPath matchExprField(matchExprPath);
     using namespace timeseries;
-    std::vector<std::unique_ptr<MatchExpression>> typeEqualityPredicates;
 
     // Assume that we're generating a predicate on "a.b"
     for (size_t i = 0; i < matchExprField.getPathLength(); i++) {
@@ -595,7 +627,8 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
     const BucketSpec& bucketSpec,
     int bucketMaxSpanSeconds,
     ExpressionContext::CollationMatchesDefault collationMatchesDefault,
-    boost::intrusive_ptr<ExpressionContext> pExpCtx) {
+    boost::intrusive_ptr<ExpressionContext> pExpCtx,
+    bool assumeNoMixedSchemaData) {
     using namespace timeseries;
     const auto matchExprPath = matchExpr->path();
     const auto matchExprData = matchExpr->getData();
@@ -684,7 +717,8 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
                                         minPath, matchExprData),
                                     MatchExprPredicate<InternalExprGTEMatchExpression>(
                                         maxPath, matchExprData)),
-                      createTypeEqualityPredicate(pExpCtx, matchExprPath)));
+                      createTypeEqualityPredicate(
+                          pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
 
         case MatchExpression::GT:
             // For $gt, make a $gt predicate against 'control.max'. In addition, if the comparison
@@ -706,7 +740,8 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
                                                                     bucketMaxSpanSeconds)))
                 : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
                       std::make_unique<InternalExprGTMatchExpression>(maxPath, matchExprData),
-                      createTypeEqualityPredicate(pExpCtx, matchExprPath)));
+                      createTypeEqualityPredicate(
+                          pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
 
         case MatchExpression::GTE:
             // For $gte, make a $gte predicate against 'control.max'. In addition, if the comparison
@@ -728,7 +763,8 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
                                                                      bucketMaxSpanSeconds)))
                 : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
                       std::make_unique<InternalExprGTEMatchExpression>(maxPath, matchExprData),
-                      createTypeEqualityPredicate(pExpCtx, matchExprPath)));
+                      createTypeEqualityPredicate(
+                          pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
 
         case MatchExpression::LT:
             // For $lt, make a $lt predicate against 'control.min'. In addition, if the comparison
@@ -748,7 +784,8 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
                                                                     bucketMaxSpanSeconds)))
                 : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
                       std::make_unique<InternalExprLTMatchExpression>(minPath, matchExprData),
-                      createTypeEqualityPredicate(pExpCtx, matchExprPath)));
+                      createTypeEqualityPredicate(
+                          pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
 
         case MatchExpression::LTE:
             // For $lte, make a $lte predicate against 'control.min'. In addition, if the comparison
@@ -768,7 +805,8 @@ std::unique_ptr<MatchExpression> createComparisonPredicate(
                                                                      bucketMaxSpanSeconds)))
                 : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
                       std::make_unique<InternalExprLTEMatchExpression>(minPath, matchExprData),
-                      createTypeEqualityPredicate(pExpCtx, matchExprPath)));
+                      createTypeEqualityPredicate(
+                          pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
 
         default:
             MONGO_UNREACHABLE_TASSERT(5348302);
@@ -797,7 +835,8 @@ DocumentSourceInternalUnpackBucket::createPredicatesOnBucketLevelField(
                                          _bucketUnpacker.bucketSpec(),
                                          _bucketMaxSpanSeconds,
                                          pExpCtx->collationMatchesDefault,
-                                         pExpCtx);
+                                         pExpCtx,
+                                         _assumeNoMixedSchemaData);
     } else if (matchExpr->matchType() == MatchExpression::GEO) {
         auto& geoExpr = static_cast<const GeoMatchExpression*>(matchExpr)->getGeoExpression();
         if (geoExpr.getPred() == GeoExpression::WITHIN ||
diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h
index 40e154c5478..c1b59078fae 100644
--- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h
+++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h
@@ -43,6 +43,7 @@ public:
     static constexpr StringData kStageNameExternal = "$_unpackBucket"_sd;
     static constexpr StringData kInclude = "include"_sd;
     static constexpr StringData kExclude = "exclude"_sd;
+    static constexpr StringData kAssumeNoMixedSchemaData = "assumeNoMixedSchemaData"_sd;
     static constexpr StringData kBucketMaxSpanSeconds = "bucketMaxSpanSeconds"_sd;
 
     static boost::intrusive_ptr<DocumentSource> createFromBsonInternal(
@@ -52,7 +53,8 @@ public:
 
     DocumentSourceInternalUnpackBucket(const boost::intrusive_ptr<ExpressionContext>& expCtx,
                                        BucketUnpacker bucketUnpacker,
-                                       int bucketMaxSpanSeconds);
+                                       int bucketMaxSpanSeconds,
+                                       bool assumeNoMixedSchemaData = false);
 
     const char* getSourceName() const override {
         return kStageNameInternal.rawData();
@@ -202,6 +204,10 @@ public:
 private:
     GetNextResult doGetNext() final;
 
+    // If buckets contained a mixed type schema along some path, we have to push down special
+    // predicates in order to ensure correctness.
+    bool _assumeNoMixedSchemaData = false;
+
     BucketUnpacker _bucketUnpacker;
     int _bucketMaxSpanSeconds;
 
diff --git a/src/mongo/db/views/SConscript b/src/mongo/db/views/SConscript
index 5ca16d561ed..38d2204fce9 100644
--- a/src/mongo/db/views/SConscript
+++ b/src/mongo/db/views/SConscript
@@ -27,6 +27,7 @@ env.Library(
     ],
     LIBDEPS=[
         '$BUILD_DIR/mongo/base',
+        '$BUILD_DIR/mongo/db/catalog/collection',
         '$BUILD_DIR/mongo/db/pipeline/aggregation',
         '$BUILD_DIR/mongo/db/query/collation/collator_factory_interface',
         '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
diff --git a/src/mongo/db/views/resolved_view.cpp b/src/mongo/db/views/resolved_view.cpp
index 9c754c559e7..c0435b8e924 100644
--- a/src/mongo/db/views/resolved_view.cpp
+++ b/src/mongo/db/views/resolved_view.cpp
@@ -72,15 +72,30 @@ ResolvedView ResolvedView::fromBSON(const BSONObj& commandResponseObj) {
         collationSpec = collationElt.embeddedObject().getOwned();
     }
 
+
+    boost::optional<bool> mixedSchema = boost::none;
+    if (auto mixedSchemaElem = viewDef[kTimeseriesMayContainMixedData]) {
+        uassert(6067204,
+                str::stream() << "view definition must have " << kTimeseriesMayContainMixedData
+                              << " of type bool or no such field",
+                mixedSchemaElem.type() == BSONType::Bool);
+
+        mixedSchema = boost::optional<bool>(mixedSchemaElem.boolean());
+    }
+
     return {NamespaceString(viewDef["ns"].valueStringData()),
             std::move(pipeline),
-            std::move(collationSpec)};
+            std::move(collationSpec),
+            std::move(mixedSchema)};
 }
 
 void ResolvedView::serialize(BSONObjBuilder* builder) const {
     BSONObjBuilder subObj(builder->subobjStart("resolvedView"));
     subObj.append("ns", _namespace.ns());
     subObj.append("pipeline", _pipeline);
+    // Only serialize if it doesn't contain mixed data.
+    if ((_timeseriesMayContainMixedData && !(*_timeseriesMayContainMixedData)))
+        subObj.append(kTimeseriesMayContainMixedData, *_timeseriesMayContainMixedData);
     if (!_defaultCollation.isEmpty()) {
         subObj.append("collation", _defaultCollation);
     }
@@ -125,6 +140,21 @@ AggregateCommandRequest ResolvedView::asExpandedViewAggregation(
         }
         resolvedPipeline[1] =
             BSON(DocumentSourceInternalConvertBucketIndexStats::kStageName << builder.obj());
+    } else if (resolvedPipeline.size() >= 1 &&
+               resolvedPipeline[0][DocumentSourceInternalUnpackBucket::kStageNameInternal] &&
+               serverGlobalParams.featureCompatibility.isGreaterThanOrEqualTo(
+                   multiversion::FeatureCompatibilityVersion::kVersion_5_2)) {
+        auto unpackStage = resolvedPipeline[0];
+
+        BSONObjBuilder builder;
+        for (const auto& elem :
+             unpackStage[DocumentSourceInternalUnpackBucket::kStageNameInternal].Obj()) {
+            builder.append(elem);
+        }
+        builder.append(DocumentSourceInternalUnpackBucket::kAssumeNoMixedSchemaData,
+                       ((_timeseriesMayContainMixedData && !(*_timeseriesMayContainMixedData))));
+        resolvedPipeline[0] =
+            BSON(DocumentSourceInternalUnpackBucket::kStageNameInternal << builder.obj());
     }
 
     AggregateCommandRequest expandedRequest{_namespace, resolvedPipeline};
diff --git a/src/mongo/db/views/resolved_view.h b/src/mongo/db/views/resolved_view.h
index 19cfefe82f9..d19642c8dd5 100644
--- a/src/mongo/db/views/resolved_view.h
+++ b/src/mongo/db/views/resolved_view.h
@@ -46,10 +46,12 @@ class ResolvedView final : public ErrorExtraInfo {
 public:
     ResolvedView(const NamespaceString& collectionNs,
                  std::vector<BSONObj> pipeline,
-                 BSONObj defaultCollation)
+                 BSONObj defaultCollation,
+                 boost::optional<bool> timeseriesMayContainMixedData = boost::none)
         : _namespace(collectionNs),
           _pipeline(std::move(pipeline)),
-          _defaultCollation(std::move(defaultCollation)) {}
+          _defaultCollation(std::move(defaultCollation)),
+          _timeseriesMayContainMixedData(timeseriesMayContainMixedData) {}
 
     static ResolvedView fromBSON(const BSONObj& commandResponseObj);
 
@@ -74,6 +76,7 @@ public:
 
     // ErrorExtraInfo API
     static constexpr auto code = ErrorCodes::CommandOnShardedViewNotSupportedOnMongod;
+    static constexpr StringData kTimeseriesMayContainMixedData = "timeseriesMayContainMixedData"_sd;
     void serialize(BSONObjBuilder* bob) const final;
     static std::shared_ptr<const ErrorExtraInfo> parse(const BSONObj&);
 
@@ -88,6 +91,8 @@ private:
     // that operations on the view which do not specify a collation inherit the default. Operations
     // on the view which specify any other collation fail with a user error.
     BSONObj _defaultCollation;
+
+    boost::optional<bool> _timeseriesMayContainMixedData;
 };
 
 }  // namespace mongo
diff --git a/src/mongo/db/views/view_catalog.cpp b/src/mongo/db/views/view_catalog.cpp
index 84e0f7b46a8..b043eb5a03d 100644
--- a/src/mongo/db/views/view_catalog.cpp
+++ b/src/mongo/db/views/view_catalog.cpp
@@ -740,6 +740,7 @@ StatusWith<ResolvedView> ViewCatalog::resolveView(
         std::vector<NamespaceString> dependencyChain{nss};
 
         int depth = 0;
+        boost::optional<bool> mixedData = boost::none;
         for (; depth < ViewGraph::kMaxViewDepth; depth++) {
             auto view =
                 _lookup(opCtx, *resolvedNss, ViewCatalogLookupBehavior::kValidateDurableViews);
@@ -761,10 +762,25 @@ StatusWith<ResolvedView> ViewCatalog::resolveView(
                 return StatusWith<ResolvedView>(
                     {*resolvedNss,
                      std::move(resolvedPipeline),
-                     collation ? std::move(collation.get()) : CollationSpec::kSimpleSpec});
+                     collation ? std::move(collation.get()) : CollationSpec::kSimpleSpec,
+                     mixedData});
             }
 
             resolvedNss = &view->viewOn();
+
+            if (view->timeseries()) {
+                auto tsCollection =
+                    CollectionCatalog::get(opCtx)->lookupCollectionByNamespace(opCtx, *resolvedNss);
+                tassert(6067201,
+                        str::stream() << "expected time-series buckets collection " << *resolvedNss
+                                      << " to exist",
+
+                        tsCollection);
+                mixedData = tsCollection
+                    ? tsCollection->getTimeseriesBucketsMayHaveMixedSchemaData()
+                    : false;
+            }
+
             dependencyChain.push_back(*resolvedNss);
             if (!collation) {
                 if (timeSeriesCollator) {
author	samontea <merciers.merciers@gmail.com>	2021-11-16 14:54:47 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-12-14 22:02:37 +0000
commit	8fb1d8039361bcbf3a0ab77127cc6712783485ca (patch)
tree	2681117e701eaffd05e48ea49f4f444044a320a5 /src
parent	56d055c507a94a70e31a014142934060b76710ed (diff)
download	mongo-8fb1d8039361bcbf3a0ab77127cc6712783485ca.tar.gz