summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Percy <david.percy@mongodb.com>2021-11-28 23:55:13 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-01-21 04:24:53 +0000
commite446bef952d2ef42ae799e9b88c877b3fe0be6e4 (patch)
tree0aa308188de3d7160e0a94b71268f9a80aa834e3
parent1a47e71ba861449fffb7feeb05f79c2f1d99497c (diff)
downloadmongo-e446bef952d2ef42ae799e9b88c877b3fe0be6e4.tar.gz
SERVER-59163 Move time-series predicate rewrites to bucket_unpacker
We will want to use these rewrites when creating a partial index. But to avoid a cyclic dependency, we don't want to depend on all of the 'pipeline' library.
-rw-r--r--src/mongo/db/exec/SConscript1
-rw-r--r--src/mongo/db/exec/bucket_unpacker.cpp390
-rw-r--r--src/mongo/db/exec/bucket_unpacker.h29
-rw-r--r--src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp373
-rw-r--r--src/mongo/db/pipeline/document_source_internal_unpack_bucket.h25
5 files changed, 433 insertions, 385 deletions
diff --git a/src/mongo/db/exec/SConscript b/src/mongo/db/exec/SConscript
index 88655a990db..1a920757dad 100644
--- a/src/mongo/db/exec/SConscript
+++ b/src/mongo/db/exec/SConscript
@@ -57,6 +57,7 @@ env.Library(
"bucket_unpacker.cpp",
],
LIBDEPS = [
+ "$BUILD_DIR/mongo/db/matcher/expressions",
"document_value/document_value",
],
LIBDEPS_PRIVATE = [
diff --git a/src/mongo/db/exec/bucket_unpacker.cpp b/src/mongo/db/exec/bucket_unpacker.cpp
index 782d2ff5235..f663b49f46d 100644
--- a/src/mongo/db/exec/bucket_unpacker.cpp
+++ b/src/mongo/db/exec/bucket_unpacker.cpp
@@ -30,9 +30,399 @@
#include "mongo/db/exec/bucket_unpacker.h"
#include "mongo/bson/util/bsoncolumn.h"
+#include "mongo/db/matcher/expression.h"
+#include "mongo/db/matcher/expression_algo.h"
+#include "mongo/db/matcher/expression_expr.h"
+#include "mongo/db/matcher/expression_geo.h"
+#include "mongo/db/matcher/expression_internal_bucket_geo_within.h"
+#include "mongo/db/matcher/expression_internal_expr_comparison.h"
+#include "mongo/db/matcher/expression_tree.h"
+#include "mongo/db/pipeline/expression.h"
#include "mongo/db/timeseries/timeseries_constants.h"
namespace mongo {
+
+bool BucketSpec::fieldIsComputed(StringData field) const {
+ return std::any_of(computedMetaProjFields.begin(), computedMetaProjFields.end(), [&](auto& s) {
+ return s == field || expression::isPathPrefixOf(field, s) ||
+ expression::isPathPrefixOf(s, field);
+ });
+}
+
+namespace {
+
+/**
+ * Creates an ObjectId initialized with an appropriate timestamp corresponding to 'rhs' and
+ * returns it as a Value.
+ */
+template <typename MatchType>
+auto constructObjectIdValue(const BSONElement& rhs, int bucketMaxSpanSeconds) {
+ // Indicates whether to initialize an ObjectId with a max or min value for the non-date bytes.
+ enum class OIDInit : bool { max, min };
+ // Make an ObjectId cooresponding to a date value. As a conversion from date to ObjectId will
+ // truncate milliseconds, we round up when needed to prevent missing results.
+ auto makeDateOID = [](auto&& date, auto&& maxOrMin, bool roundMillisUpToSecond = false) {
+ if (roundMillisUpToSecond && (date.toMillisSinceEpoch() % 1000 != 0)) {
+ date += Seconds{1};
+ }
+
+ auto oid = OID{};
+ oid.init(date, maxOrMin == OIDInit::max);
+ return oid;
+ };
+ // Make an ObjectId cooresponding to a date value adjusted by the max bucket value for the
+ // time series view that this query operates on. This predicate can be used in a comparison
+ // to gauge a max value for a given bucket, rather than a min value.
+ auto makeMaxAdjustedDateOID = [&](auto&& date, auto&& maxOrMin) {
+ // Ensure we don't underflow.
+ if (date.toDurationSinceEpoch() >= Seconds{bucketMaxSpanSeconds})
+ // Subtract max bucket range.
+ return makeDateOID(date - Seconds{bucketMaxSpanSeconds}, maxOrMin);
+ else
+ // Since we're out of range, just make a predicate that is true for all date types.
+ return makeDateOID(Date_t::min(), OIDInit::min);
+ };
+ // An ObjectId consists of a 4-byte timestamp, as well as a unique value and a counter, thus
+ // two ObjectIds initialized with the same date will have different values. To ensure that we
+ // do not incorrectly include or exclude any buckets, depending on the operator we will
+ // construct either the largest or the smallest ObjectId possible with the corresponding date.
+ // If the query operand is not of type Date, the original query will not match on any documents
+ // because documents in a time-series collection must have a timeField of type Date. We will
+ // make this case faster by keeping the ObjectId as the lowest or highest possible value so as
+ // to eliminate all buckets.
+ if constexpr (std::is_same_v<MatchType, LTMatchExpression>) {
+ return Value{makeDateOID(rhs.date(), OIDInit::min, true /*roundMillisUpToSecond*/)};
+ } else if constexpr (std::is_same_v<MatchType, LTEMatchExpression>) {
+ return Value{makeDateOID(rhs.date(), OIDInit::max, true /*roundMillisUpToSecond*/)};
+ } else if constexpr (std::is_same_v<MatchType, GTMatchExpression>) {
+ return Value{makeMaxAdjustedDateOID(rhs.date(), OIDInit::max)};
+ } else if constexpr (std::is_same_v<MatchType, GTEMatchExpression>) {
+ return Value{makeMaxAdjustedDateOID(rhs.date(), OIDInit::min)};
+ }
+ MONGO_UNREACHABLE_TASSERT(5756800);
+}
+
+/*
+ * Creates a predicate that ensures that if there exists a subpath of matchExprPath such that the
+ * type of `control.min.subpath` is not the same as `control.max.subpath` then we will match that
+ * document.
+ *
+ * However, if the buckets collection has no mixed-schema data then this type-equality predicate is
+ * unnecessary. In that case this function returns an empty, always-true predicate.
+ */
+std::unique_ptr<MatchExpression> createTypeEqualityPredicate(
+ boost::intrusive_ptr<ExpressionContext> pExpCtx,
+ const StringData& matchExprPath,
+ bool assumeNoMixedSchemaData) {
+
+ std::vector<std::unique_ptr<MatchExpression>> typeEqualityPredicates;
+
+ if (assumeNoMixedSchemaData)
+ return std::make_unique<OrMatchExpression>(std::move(typeEqualityPredicates));
+
+ FieldPath matchExprField(matchExprPath);
+ using namespace timeseries;
+
+ // Assume that we're generating a predicate on "a.b"
+ for (size_t i = 0; i < matchExprField.getPathLength(); i++) {
+ auto minPath = std::string{kControlMinFieldNamePrefix} + matchExprField.getSubpath(i);
+ auto maxPath = std::string{kControlMaxFieldNamePrefix} + matchExprField.getSubpath(i);
+
+ // This whole block adds
+ // {$expr: {$ne: [{$type: "$control.min.a"}, {$type: "$control.max.a"}]}}
+ // in order to ensure that the type of `control.min.a` and `control.max.a` are the same.
+
+ // This produces {$expr: ... }
+ typeEqualityPredicates.push_back(std::make_unique<ExprMatchExpression>(
+ // This produces {$ne: ... }
+ make_intrusive<ExpressionCompare>(
+ pExpCtx.get(),
+ ExpressionCompare::CmpOp::NE,
+ // This produces [...]
+ makeVector<boost::intrusive_ptr<Expression>>(
+ // This produces {$type: ... }
+ make_intrusive<ExpressionType>(
+ pExpCtx.get(),
+ // This produces [...]
+ makeVector<boost::intrusive_ptr<Expression>>(
+ // This produces "$control.min.a"
+ ExpressionFieldPath::createPathFromString(
+ pExpCtx.get(), minPath, pExpCtx->variablesParseState))),
+ // This produces {$type: ... }
+ make_intrusive<ExpressionType>(
+ pExpCtx.get(),
+ // This produces [...]
+ makeVector<boost::intrusive_ptr<Expression>>(
+ // This produces "$control.max.a"
+ ExpressionFieldPath::createPathFromString(
+ pExpCtx.get(), maxPath, pExpCtx->variablesParseState))))),
+ pExpCtx));
+ }
+ return std::make_unique<OrMatchExpression>(std::move(typeEqualityPredicates));
+}
+
+std::unique_ptr<MatchExpression> createComparisonPredicate(
+ const ComparisonMatchExpressionBase* matchExpr,
+ const BucketSpec& bucketSpec,
+ int bucketMaxSpanSeconds,
+ ExpressionContext::CollationMatchesDefault collationMatchesDefault,
+ boost::intrusive_ptr<ExpressionContext> pExpCtx,
+ bool assumeNoMixedSchemaData) {
+ using namespace timeseries;
+ const auto matchExprPath = matchExpr->path();
+ const auto matchExprData = matchExpr->getData();
+
+ // The control field's min and max are chosen using a field-order insensitive comparator, while
+ // MatchExpressions use a comparator that treats field-order as significant. Because of this we
+ // will not perform this optimization on queries with operands of compound types.
+ if (matchExprData.type() == BSONType::Object || matchExprData.type() == BSONType::Array)
+ return nullptr;
+
+ // MatchExpressions have special comparison semantics regarding null, in that {$eq: null} will
+ // match all documents where the field is either null or missing. Because this is different
+ // from both the comparison semantics that InternalExprComparison expressions and the control's
+ // min and max fields use, we will not perform this optimization on queries with null operands.
+ if (matchExprData.type() == BSONType::jstNULL)
+ return nullptr;
+
+ // The control field's min and max are chosen based on the collation of the collection. If the
+ // query's collation does not match the collection's collation and the query operand is a
+ // string or compound type (skipped above) we will not perform this optimization.
+ if (collationMatchesDefault == ExpressionContext::CollationMatchesDefault::kNo &&
+ matchExprData.type() == BSONType::String) {
+ return nullptr;
+ }
+
+ // We must avoid mapping predicates on the meta field onto the control field.
+ if (bucketSpec.metaField() &&
+ (matchExprPath == bucketSpec.metaField().get() ||
+ expression::isPathPrefixOf(bucketSpec.metaField().get(), matchExprPath)))
+ return nullptr;
+
+ // We must avoid mapping predicates on fields computed via $addFields or a computed $project.
+ if (bucketSpec.fieldIsComputed(matchExprPath.toString())) {
+ return nullptr;
+ }
+
+ const auto isTimeField = (matchExprPath == bucketSpec.timeField());
+ if (isTimeField && matchExprData.type() != BSONType::Date) {
+ // Users are not allowed to insert non-date measurements into time field. So this query
+ // would not match anything. We do not need to optimize for this case.
+ return nullptr;
+ }
+
+ BSONObj minTime;
+ BSONObj maxTime;
+ if (isTimeField) {
+ auto timeField = matchExprData.Date();
+ minTime = BSON("" << timeField - Seconds(bucketMaxSpanSeconds));
+ maxTime = BSON("" << timeField + Seconds(bucketMaxSpanSeconds));
+ }
+
+ auto minPath = std::string{kControlMinFieldNamePrefix} + matchExprPath;
+ auto maxPath = std::string{kControlMaxFieldNamePrefix} + matchExprPath;
+
+ switch (matchExpr->matchType()) {
+ case MatchExpression::EQ:
+ case MatchExpression::INTERNAL_EXPR_EQ:
+ // For $eq, make both a $lte against 'control.min' and a $gte predicate against
+ // 'control.max'.
+ //
+ // If the comparison is against the 'time' field, include a predicate against the _id
+ // field which is converted to the maximum for the corresponding range of ObjectIds and
+ // is adjusted by the max range for a bucket to approximate the max bucket value given
+ // the min. Also include a predicate against the _id field which is converted to the
+ // minimum for the range of ObjectIds corresponding to the given date. In
+ // addition, we include a {'control.min' : {$gte: 'time - bucketMaxSpanSeconds'}} and
+ // a {'control.max' : {$lte: 'time + bucketMaxSpanSeconds'}} predicate which will be
+ // helpful in reducing bounds for index scans on 'time' field and routing on mongos.
+ //
+ // The same procedure applies to aggregation expressions of the form
+ // {$expr: {$eq: [...]}} that can be rewritten to use $_internalExprEq.
+ return isTimeField
+ ? makePredicate(
+ MatchExprPredicate<InternalExprLTEMatchExpression>(minPath, matchExprData),
+ MatchExprPredicate<InternalExprGTEMatchExpression>(minPath,
+ minTime.firstElement()),
+ MatchExprPredicate<InternalExprGTEMatchExpression>(maxPath, matchExprData),
+ MatchExprPredicate<InternalExprLTEMatchExpression>(maxPath,
+ maxTime.firstElement()),
+ MatchExprPredicate<LTEMatchExpression, Value>(
+ kBucketIdFieldName,
+ constructObjectIdValue<LTEMatchExpression>(matchExprData,
+ bucketMaxSpanSeconds)),
+ MatchExprPredicate<GTEMatchExpression, Value>(
+ kBucketIdFieldName,
+ constructObjectIdValue<GTEMatchExpression>(matchExprData,
+ bucketMaxSpanSeconds)))
+ : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
+ makePredicate(MatchExprPredicate<InternalExprLTEMatchExpression>(
+ minPath, matchExprData),
+ MatchExprPredicate<InternalExprGTEMatchExpression>(
+ maxPath, matchExprData)),
+ createTypeEqualityPredicate(
+ pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
+
+ case MatchExpression::GT:
+ case MatchExpression::INTERNAL_EXPR_GT:
+ // For $gt, make a $gt predicate against 'control.max'. In addition, if the comparison
+ // is against the 'time' field, include a predicate against the _id field which is
+ // converted to the maximum for the corresponding range of ObjectIds and is adjusted
+ // by the max range for a bucket to approximate the max bucket value given the min. In
+ // addition, we include a {'control.min' : {$gt: 'time - bucketMaxSpanSeconds'}}
+ // predicate which will be helpful in reducing bounds for index scans on 'time' field
+ // and routing on mongos.
+ //
+ // The same procedure applies to aggregation expressions of the form
+ // {$expr: {$gt: [...]}} that can be rewritten to use $_internalExprGt.
+ return isTimeField
+ ? makePredicate(
+ MatchExprPredicate<InternalExprGTMatchExpression>(maxPath, matchExprData),
+ MatchExprPredicate<InternalExprGTMatchExpression>(minPath,
+ minTime.firstElement()),
+ MatchExprPredicate<GTMatchExpression, Value>(
+ kBucketIdFieldName,
+ constructObjectIdValue<GTMatchExpression>(matchExprData,
+ bucketMaxSpanSeconds)))
+ : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
+ std::make_unique<InternalExprGTMatchExpression>(maxPath, matchExprData),
+ createTypeEqualityPredicate(
+ pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
+
+ case MatchExpression::GTE:
+ case MatchExpression::INTERNAL_EXPR_GTE:
+ // For $gte, make a $gte predicate against 'control.max'. In addition, if the comparison
+ // is against the 'time' field, include a predicate against the _id field which is
+ // converted to the minimum for the corresponding range of ObjectIds and is adjusted
+ // by the max range for a bucket to approximate the max bucket value given the min. In
+ // addition, we include a {'control.min' : {$gte: 'time - bucketMaxSpanSeconds'}}
+ // predicate which will be helpful in reducing bounds for index scans on 'time' field
+ // and routing on mongos.
+ //
+ // The same procedure applies to aggregation expressions of the form
+ // {$expr: {$gte: [...]}} that can be rewritten to use $_internalExprGte.
+ return isTimeField
+ ? makePredicate(
+ MatchExprPredicate<InternalExprGTEMatchExpression>(maxPath, matchExprData),
+ MatchExprPredicate<InternalExprGTEMatchExpression>(minPath,
+ minTime.firstElement()),
+ MatchExprPredicate<GTEMatchExpression, Value>(
+ kBucketIdFieldName,
+ constructObjectIdValue<GTEMatchExpression>(matchExprData,
+ bucketMaxSpanSeconds)))
+ : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
+ std::make_unique<InternalExprGTEMatchExpression>(maxPath, matchExprData),
+ createTypeEqualityPredicate(
+ pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
+
+ case MatchExpression::LT:
+ case MatchExpression::INTERNAL_EXPR_LT:
+ // For $lt, make a $lt predicate against 'control.min'. In addition, if the comparison
+ // is against the 'time' field, include a predicate against the _id field which is
+ // converted to the minimum for the corresponding range of ObjectIds. In
+ // addition, we include a {'control.max' : {$lt: 'time + bucketMaxSpanSeconds'}}
+ // predicate which will be helpful in reducing bounds for index scans on 'time' field
+ // and routing on mongos.
+ //
+ // The same procedure applies to aggregation expressions of the form
+ // {$expr: {$lt: [...]}} that can be rewritten to use $_internalExprLt.
+ return isTimeField
+ ? makePredicate(
+ MatchExprPredicate<InternalExprLTMatchExpression>(minPath, matchExprData),
+ MatchExprPredicate<InternalExprLTMatchExpression>(maxPath,
+ maxTime.firstElement()),
+ MatchExprPredicate<LTMatchExpression, Value>(
+ kBucketIdFieldName,
+ constructObjectIdValue<LTMatchExpression>(matchExprData,
+ bucketMaxSpanSeconds)))
+ : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
+ std::make_unique<InternalExprLTMatchExpression>(minPath, matchExprData),
+ createTypeEqualityPredicate(
+ pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
+
+ case MatchExpression::LTE:
+ case MatchExpression::INTERNAL_EXPR_LTE:
+ // For $lte, make a $lte predicate against 'control.min'. In addition, if the comparison
+ // is against the 'time' field, include a predicate against the _id field which is
+ // converted to the maximum for the corresponding range of ObjectIds. In
+ // addition, we include a {'control.max' : {$lte: 'time + bucketMaxSpanSeconds'}}
+ // predicate which will be helpful in reducing bounds for index scans on 'time' field
+ // and routing on mongos.
+ //
+ // The same procedure applies to aggregation expressions of the form
+ // {$expr: {$lte: [...]}} that can be rewritten to use $_internalExprLte.
+ return isTimeField
+ ? makePredicate(
+ MatchExprPredicate<InternalExprLTEMatchExpression>(minPath, matchExprData),
+ MatchExprPredicate<InternalExprLTEMatchExpression>(maxPath,
+ maxTime.firstElement()),
+ MatchExprPredicate<LTEMatchExpression, Value>(
+ kBucketIdFieldName,
+ constructObjectIdValue<LTEMatchExpression>(matchExprData,
+ bucketMaxSpanSeconds)))
+ : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
+ std::make_unique<InternalExprLTEMatchExpression>(minPath, matchExprData),
+ createTypeEqualityPredicate(
+ pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
+
+ default:
+ MONGO_UNREACHABLE_TASSERT(5348302);
+ }
+
+ MONGO_UNREACHABLE_TASSERT(5348303);
+}
+
+} // namespace
+
+std::unique_ptr<MatchExpression> BucketSpec::createPredicatesOnBucketLevelField(
+ const MatchExpression* matchExpr,
+ const BucketSpec& bucketSpec,
+ int bucketMaxSpanSeconds,
+ ExpressionContext::CollationMatchesDefault collationMatchesDefault,
+ const boost::intrusive_ptr<ExpressionContext>& pExpCtx,
+ bool assumeNoMixedSchemaData) {
+ if (matchExpr->matchType() == MatchExpression::AND) {
+ auto nextAnd = static_cast<const AndMatchExpression*>(matchExpr);
+ auto andMatchExpr = std::make_unique<AndMatchExpression>();
+
+ for (size_t i = 0; i < nextAnd->numChildren(); i++) {
+ if (auto child = createPredicatesOnBucketLevelField(
+ nextAnd->getChild(i),
+ bucketSpec,
+ bucketMaxSpanSeconds,
+ collationMatchesDefault,
+ pExpCtx,
+ assumeNoMixedSchemaData)) {
+ andMatchExpr->add(std::move(child));
+ }
+ }
+ if (andMatchExpr->numChildren() == 1) {
+ return andMatchExpr->releaseChild(0);
+ }
+ if (andMatchExpr->numChildren() > 0) {
+ return andMatchExpr;
+ }
+ } else if (ComparisonMatchExpression::isComparisonMatchExpression(matchExpr) ||
+ ComparisonMatchExpressionBase::isInternalExprComparison(matchExpr->matchType())) {
+ return createComparisonPredicate(
+ static_cast<const ComparisonMatchExpressionBase*>(matchExpr),
+ bucketSpec,
+ bucketMaxSpanSeconds,
+ pExpCtx->collationMatchesDefault,
+ pExpCtx,
+ assumeNoMixedSchemaData);
+ } else if (matchExpr->matchType() == MatchExpression::GEO) {
+ auto& geoExpr = static_cast<const GeoMatchExpression*>(matchExpr)->getGeoExpression();
+ if (geoExpr.getPred() == GeoExpression::WITHIN ||
+ geoExpr.getPred() == GeoExpression::INTERSECT) {
+ return std::make_unique<InternalBucketGeoWithinMatchExpression>(
+ geoExpr.getGeometryPtr(), geoExpr.getField());
+ }
+ }
+
+ return nullptr;
+}
+
class BucketUnpacker::UnpackingImpl {
public:
UnpackingImpl() = default;
diff --git a/src/mongo/db/exec/bucket_unpacker.h b/src/mongo/db/exec/bucket_unpacker.h
index 2f5fd70b785..05223ace00d 100644
--- a/src/mongo/db/exec/bucket_unpacker.h
+++ b/src/mongo/db/exec/bucket_unpacker.h
@@ -35,6 +35,8 @@
#include "mongo/bson/bsonobj.h"
#include "mongo/db/exec/document_value/document.h"
+#include "mongo/db/matcher/expression.h"
+#include "mongo/db/pipeline/expression_context.h"
namespace mongo {
/**
@@ -64,6 +66,33 @@ public:
const boost::optional<std::string>& metaField() const;
boost::optional<HashedFieldName> metaFieldHashed() const;
+ // Returns whether 'field' depends on a pushed down $addFields or computed $project.
+ bool fieldIsComputed(StringData field) const;
+
+ /**
+ * Takes a predicate after $_internalUnpackBucket on a bucketed field as an argument and
+ * attempts to map it to a new predicate on the 'control' field. For example, the predicate
+ * {a: {$gt: 5}} will generate the predicate {control.max.a: {$_internalExprGt: 5}}, which will
+ * be added before the $_internalUnpackBucket stage.
+ *
+ * If the original predicate is on the bucket's timeField we may also create a new predicate
+ * on the '_id' field to assist in index utilization. For example, the predicate
+ * {time: {$lt: new Date(...)}} will generate the following predicate:
+ * {$and: [
+ * {_id: {$lt: ObjectId(...)}},
+ * {control.min.time: {$_internalExprLt: new Date(...)}}
+ * ]}
+ *
+ * If the provided predicate is ineligible for this mapping, the function will return a nullptr.
+ */
+ static std::unique_ptr<MatchExpression> createPredicatesOnBucketLevelField(
+ const MatchExpression* matchExpr,
+ const BucketSpec& bucketSpec,
+ int bucketMaxSpanSeconds,
+ ExpressionContext::CollationMatchesDefault collationMatchesDefault,
+ const boost::intrusive_ptr<ExpressionContext>& pExpCtx,
+ bool assumeNoMixedSchemaData);
+
// The set of field names in the data region that should be included or excluded.
std::set<std::string> fieldSet;
diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
index 962129e7545..055b7be6fa6 100644
--- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
+++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.cpp
@@ -108,57 +108,6 @@ auto getIncludeExcludeProjectAndType(DocumentSource* src) {
}
/**
- * Creates an ObjectId initialized with an appropriate timestamp corresponding to 'rhs' and
- * returns it as a Value.
- */
-template <typename MatchType>
-auto constructObjectIdValue(const BSONElement& rhs, int bucketMaxSpanSeconds) {
- // Indicates whether to initialize an ObjectId with a max or min value for the non-date bytes.
- enum class OIDInit : bool { max, min };
- // Make an ObjectId cooresponding to a date value. As a conversion from date to ObjectId will
- // truncate milliseconds, we round up when needed to prevent missing results.
- auto makeDateOID = [](auto&& date, auto&& maxOrMin, bool roundMillisUpToSecond = false) {
- if (roundMillisUpToSecond && (date.toMillisSinceEpoch() % 1000 != 0)) {
- date += Seconds{1};
- }
-
- auto oid = OID{};
- oid.init(date, maxOrMin == OIDInit::max);
- return oid;
- };
- // Make an ObjectId cooresponding to a date value adjusted by the max bucket value for the
- // time series view that this query operates on. This predicate can be used in a comparison
- // to gauge a max value for a given bucket, rather than a min value.
- auto makeMaxAdjustedDateOID = [&](auto&& date, auto&& maxOrMin) {
- // Ensure we don't underflow.
- if (date.toDurationSinceEpoch() >= Seconds{bucketMaxSpanSeconds})
- // Subtract max bucket range.
- return makeDateOID(date - Seconds{bucketMaxSpanSeconds}, maxOrMin);
- else
- // Since we're out of range, just make a predicate that is true for all date types.
- return makeDateOID(Date_t::min(), OIDInit::min);
- };
- // An ObjectId consists of a 4-byte timestamp, as well as a unique value and a counter, thus
- // two ObjectIds initialized with the same date will have different values. To ensure that we
- // do not incorrectly include or exclude any buckets, depending on the operator we will
- // construct either the largest or the smallest ObjectId possible with the corresponding date.
- // If the query operand is not of type Date, the original query will not match on any documents
- // because documents in a time-series collection must have a timeField of type Date. We will
- // make this case faster by keeping the ObjectId as the lowest or highest possible value so as
- // to eliminate all buckets.
- if constexpr (std::is_same_v<MatchType, LTMatchExpression>) {
- return Value{makeDateOID(rhs.date(), OIDInit::min, true /*roundMillisUpToSecond*/)};
- } else if constexpr (std::is_same_v<MatchType, LTEMatchExpression>) {
- return Value{makeDateOID(rhs.date(), OIDInit::max, true /*roundMillisUpToSecond*/)};
- } else if constexpr (std::is_same_v<MatchType, GTMatchExpression>) {
- return Value{makeMaxAdjustedDateOID(rhs.date(), OIDInit::max)};
- } else if constexpr (std::is_same_v<MatchType, GTEMatchExpression>) {
- return Value{makeMaxAdjustedDateOID(rhs.date(), OIDInit::min)};
- }
- MONGO_UNREACHABLE_TASSERT(5756800);
-}
-
-/**
* Checks if a sort stage's pattern following our internal unpack bucket is suitable to be reordered
* before us. The sort stage must refer exclusively to the meta field or any subfields.
*/
@@ -211,15 +160,6 @@ void optimizePrefix(Pipeline::SourceContainer::iterator itr, Pipeline::SourceCon
container->splice(itr, prefix);
}
-// Returns whether 'field' depends on a pushed down $addFields or computed $project.
-bool fieldIsComputed(BucketSpec spec, std::string field) {
- return std::any_of(
- spec.computedMetaProjFields.begin(), spec.computedMetaProjFields.end(), [&](auto& s) {
- return s == field || expression::isPathPrefixOf(field, s) ||
- expression::isPathPrefixOf(s, field);
- });
-}
-
} // namespace
DocumentSourceInternalUnpackBucket::DocumentSourceInternalUnpackBucket(
@@ -568,313 +508,15 @@ std::pair<BSONObj, bool> DocumentSourceInternalUnpackBucket::extractOrBuildProje
return {BSONObj{}, false};
}
-/*
- * Creates a predicate that ensures that if there exists a subpath of matchExprPath such that the
- * type of `control.min.subpath` is not the same as `control.max.subpath` then we will match that
- * document.
- *
- * However, if the buckets collection has no mixed-schema data then this type-equality predicate is
- * unnecessary. In that case this function returns an empty, always-true predicate.
- */
-std::unique_ptr<MatchExpression> createTypeEqualityPredicate(
- boost::intrusive_ptr<ExpressionContext> pExpCtx,
- const StringData& matchExprPath,
- bool assumeNoMixedSchemaData) {
-
- std::vector<std::unique_ptr<MatchExpression>> typeEqualityPredicates;
-
- if (assumeNoMixedSchemaData)
- return std::make_unique<OrMatchExpression>(std::move(typeEqualityPredicates));
-
- FieldPath matchExprField(matchExprPath);
- using namespace timeseries;
-
- // Assume that we're generating a predicate on "a.b"
- for (size_t i = 0; i < matchExprField.getPathLength(); i++) {
- auto minPath = std::string{kControlMinFieldNamePrefix} + matchExprField.getSubpath(i);
- auto maxPath = std::string{kControlMaxFieldNamePrefix} + matchExprField.getSubpath(i);
-
- // This whole block adds
- // {$expr: {$ne: [{$type: "$control.min.a"}, {$type: "$control.max.a"}]}}
- // in order to ensure that the type of `control.min.a` and `control.max.a` are the same.
-
- // This produces {$expr: ... }
- typeEqualityPredicates.push_back(std::make_unique<ExprMatchExpression>(
- // This produces {$ne: ... }
- make_intrusive<ExpressionCompare>(
- pExpCtx.get(),
- ExpressionCompare::CmpOp::NE,
- // This produces [...]
- makeVector<boost::intrusive_ptr<Expression>>(
- // This produces {$type: ... }
- make_intrusive<ExpressionType>(
- pExpCtx.get(),
- // This produces [...]
- makeVector<boost::intrusive_ptr<Expression>>(
- // This produces "$control.min.a"
- ExpressionFieldPath::createPathFromString(
- pExpCtx.get(), minPath, pExpCtx->variablesParseState))),
- // This produces {$type: ... }
- make_intrusive<ExpressionType>(
- pExpCtx.get(),
- // This produces [...]
- makeVector<boost::intrusive_ptr<Expression>>(
- // This produces "$control.max.a"
- ExpressionFieldPath::createPathFromString(
- pExpCtx.get(), maxPath, pExpCtx->variablesParseState))))),
- pExpCtx));
- }
- return std::make_unique<OrMatchExpression>(std::move(typeEqualityPredicates));
-}
-
-std::unique_ptr<MatchExpression> createComparisonPredicate(
- const ComparisonMatchExpressionBase* matchExpr,
- const BucketSpec& bucketSpec,
- int bucketMaxSpanSeconds,
- ExpressionContext::CollationMatchesDefault collationMatchesDefault,
- boost::intrusive_ptr<ExpressionContext> pExpCtx,
- bool assumeNoMixedSchemaData) {
- using namespace timeseries;
- const auto matchExprPath = matchExpr->path();
- const auto matchExprData = matchExpr->getData();
-
- // The control field's min and max are chosen using a field-order insensitive comparator, while
- // MatchExpressions use a comparator that treats field-order as significant. Because of this we
- // will not perform this optimization on queries with operands of compound types.
- if (matchExprData.type() == BSONType::Object || matchExprData.type() == BSONType::Array)
- return nullptr;
-
- // MatchExpressions have special comparison semantics regarding null, in that {$eq: null} will
- // match all documents where the field is either null or missing. Because this is different
- // from both the comparison semantics that InternalExprComparison expressions and the control's
- // min and max fields use, we will not perform this optimization on queries with null operands.
- if (matchExprData.type() == BSONType::jstNULL)
- return nullptr;
-
- // The control field's min and max are chosen based on the collation of the collection. If the
- // query's collation does not match the collection's collation and the query operand is a
- // string or compound type (skipped above) we will not perform this optimization.
- if (collationMatchesDefault == ExpressionContext::CollationMatchesDefault::kNo &&
- matchExprData.type() == BSONType::String) {
- return nullptr;
- }
-
- // We must avoid mapping predicates on the meta field onto the control field.
- if (bucketSpec.metaField() &&
- (matchExprPath == bucketSpec.metaField().get() ||
- expression::isPathPrefixOf(bucketSpec.metaField().get(), matchExprPath)))
- return nullptr;
-
- // We must avoid mapping predicates on fields computed via $addFields or a computed $project.
- if (fieldIsComputed(bucketSpec, matchExprPath.toString())) {
- return nullptr;
- }
-
- const auto isTimeField = (matchExprPath == bucketSpec.timeField());
- if (isTimeField && matchExprData.type() != BSONType::Date) {
- // Users are not allowed to insert non-date measurements into time field. So this query
- // would not match anything. We do not need to optimize for this case.
- return nullptr;
- }
-
- BSONObj minTime;
- BSONObj maxTime;
- if (isTimeField) {
- auto timeField = matchExprData.Date();
- minTime = BSON("" << timeField - Seconds(bucketMaxSpanSeconds));
- maxTime = BSON("" << timeField + Seconds(bucketMaxSpanSeconds));
- }
-
- auto minPath = std::string{kControlMinFieldNamePrefix} + matchExprPath;
- auto maxPath = std::string{kControlMaxFieldNamePrefix} + matchExprPath;
-
- switch (matchExpr->matchType()) {
- case MatchExpression::EQ:
- case MatchExpression::INTERNAL_EXPR_EQ:
- // For $eq, make both a $lte against 'control.min' and a $gte predicate against
- // 'control.max'.
- //
- // If the comparison is against the 'time' field, include a predicate against the _id
- // field which is converted to the maximum for the corresponding range of ObjectIds and
- // is adjusted by the max range for a bucket to approximate the max bucket value given
- // the min. Also include a predicate against the _id field which is converted to the
- // minimum for the range of ObjectIds corresponding to the given date. In
- // addition, we include a {'control.min' : {$gte: 'time - bucketMaxSpanSeconds'}} and
- // a {'control.max' : {$lte: 'time + bucketMaxSpanSeconds'}} predicate which will be
- // helpful in reducing bounds for index scans on 'time' field and routing on mongos.
- //
- // The same procedure applies to aggregation expressions of the form
- // {$expr: {$eq: [...]}} that can be rewritten to use $_internalExprEq.
- return isTimeField
- ? makePredicate(
- MatchExprPredicate<InternalExprLTEMatchExpression>(minPath, matchExprData),
- MatchExprPredicate<InternalExprGTEMatchExpression>(minPath,
- minTime.firstElement()),
- MatchExprPredicate<InternalExprGTEMatchExpression>(maxPath, matchExprData),
- MatchExprPredicate<InternalExprLTEMatchExpression>(maxPath,
- maxTime.firstElement()),
- MatchExprPredicate<LTEMatchExpression, Value>(
- kBucketIdFieldName,
- constructObjectIdValue<LTEMatchExpression>(matchExprData,
- bucketMaxSpanSeconds)),
- MatchExprPredicate<GTEMatchExpression, Value>(
- kBucketIdFieldName,
- constructObjectIdValue<GTEMatchExpression>(matchExprData,
- bucketMaxSpanSeconds)))
- : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
- makePredicate(MatchExprPredicate<InternalExprLTEMatchExpression>(
- minPath, matchExprData),
- MatchExprPredicate<InternalExprGTEMatchExpression>(
- maxPath, matchExprData)),
- createTypeEqualityPredicate(
- pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
-
- case MatchExpression::GT:
- case MatchExpression::INTERNAL_EXPR_GT:
- // For $gt, make a $gt predicate against 'control.max'. In addition, if the comparison
- // is against the 'time' field, include a predicate against the _id field which is
- // converted to the maximum for the corresponding range of ObjectIds and is adjusted
- // by the max range for a bucket to approximate the max bucket value given the min. In
- // addition, we include a {'control.min' : {$gt: 'time - bucketMaxSpanSeconds'}}
- // predicate which will be helpful in reducing bounds for index scans on 'time' field
- // and routing on mongos.
- //
- // The same procedure applies to aggregation expressions of the form
- // {$expr: {$gt: [...]}} that can be rewritten to use $_internalExprGt.
- return isTimeField
- ? makePredicate(
- MatchExprPredicate<InternalExprGTMatchExpression>(maxPath, matchExprData),
- MatchExprPredicate<InternalExprGTMatchExpression>(minPath,
- minTime.firstElement()),
- MatchExprPredicate<GTMatchExpression, Value>(
- kBucketIdFieldName,
- constructObjectIdValue<GTMatchExpression>(matchExprData,
- bucketMaxSpanSeconds)))
- : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
- std::make_unique<InternalExprGTMatchExpression>(maxPath, matchExprData),
- createTypeEqualityPredicate(
- pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
-
- case MatchExpression::GTE:
- case MatchExpression::INTERNAL_EXPR_GTE:
- // For $gte, make a $gte predicate against 'control.max'. In addition, if the comparison
- // is against the 'time' field, include a predicate against the _id field which is
- // converted to the minimum for the corresponding range of ObjectIds and is adjusted
- // by the max range for a bucket to approximate the max bucket value given the min. In
- // addition, we include a {'control.min' : {$gte: 'time - bucketMaxSpanSeconds'}}
- // predicate which will be helpful in reducing bounds for index scans on 'time' field
- // and routing on mongos.
- //
- // The same procedure applies to aggregation expressions of the form
- // {$expr: {$gte: [...]}} that can be rewritten to use $_internalExprGte.
- return isTimeField
- ? makePredicate(
- MatchExprPredicate<InternalExprGTEMatchExpression>(maxPath, matchExprData),
- MatchExprPredicate<InternalExprGTEMatchExpression>(minPath,
- minTime.firstElement()),
- MatchExprPredicate<GTEMatchExpression, Value>(
- kBucketIdFieldName,
- constructObjectIdValue<GTEMatchExpression>(matchExprData,
- bucketMaxSpanSeconds)))
- : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
- std::make_unique<InternalExprGTEMatchExpression>(maxPath, matchExprData),
- createTypeEqualityPredicate(
- pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
-
- case MatchExpression::LT:
- case MatchExpression::INTERNAL_EXPR_LT:
- // For $lt, make a $lt predicate against 'control.min'. In addition, if the comparison
- // is against the 'time' field, include a predicate against the _id field which is
- // converted to the minimum for the corresponding range of ObjectIds. In
- // addition, we include a {'control.max' : {$lt: 'time + bucketMaxSpanSeconds'}}
- // predicate which will be helpful in reducing bounds for index scans on 'time' field
- // and routing on mongos.
- //
- // The same procedure applies to aggregation expressions of the form
- // {$expr: {$lt: [...]}} that can be rewritten to use $_internalExprLt.
- return isTimeField
- ? makePredicate(
- MatchExprPredicate<InternalExprLTMatchExpression>(minPath, matchExprData),
- MatchExprPredicate<InternalExprLTMatchExpression>(maxPath,
- maxTime.firstElement()),
- MatchExprPredicate<LTMatchExpression, Value>(
- kBucketIdFieldName,
- constructObjectIdValue<LTMatchExpression>(matchExprData,
- bucketMaxSpanSeconds)))
- : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
- std::make_unique<InternalExprLTMatchExpression>(minPath, matchExprData),
- createTypeEqualityPredicate(
- pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
-
- case MatchExpression::LTE:
- case MatchExpression::INTERNAL_EXPR_LTE:
- // For $lte, make a $lte predicate against 'control.min'. In addition, if the comparison
- // is against the 'time' field, include a predicate against the _id field which is
- // converted to the maximum for the corresponding range of ObjectIds. In
- // addition, we include a {'control.max' : {$lte: 'time + bucketMaxSpanSeconds'}}
- // predicate which will be helpful in reducing bounds for index scans on 'time' field
- // and routing on mongos.
- //
- // The same procedure applies to aggregation expressions of the form
- // {$expr: {$lte: [...]}} that can be rewritten to use $_internalExprLte.
- return isTimeField
- ? makePredicate(
- MatchExprPredicate<InternalExprLTEMatchExpression>(minPath, matchExprData),
- MatchExprPredicate<InternalExprLTEMatchExpression>(maxPath,
- maxTime.firstElement()),
- MatchExprPredicate<LTEMatchExpression, Value>(
- kBucketIdFieldName,
- constructObjectIdValue<LTEMatchExpression>(matchExprData,
- bucketMaxSpanSeconds)))
- : std::make_unique<OrMatchExpression>(makeVector<std::unique_ptr<MatchExpression>>(
- std::make_unique<InternalExprLTEMatchExpression>(minPath, matchExprData),
- createTypeEqualityPredicate(
- pExpCtx, matchExprPath, assumeNoMixedSchemaData)));
-
- default:
- MONGO_UNREACHABLE_TASSERT(5348302);
- }
-
- MONGO_UNREACHABLE_TASSERT(5348303);
-}
-
std::unique_ptr<MatchExpression>
DocumentSourceInternalUnpackBucket::createPredicatesOnBucketLevelField(
const MatchExpression* matchExpr) const {
- if (matchExpr->matchType() == MatchExpression::AND) {
- auto nextAnd = static_cast<const AndMatchExpression*>(matchExpr);
- auto andMatchExpr = std::make_unique<AndMatchExpression>();
-
- for (size_t i = 0; i < nextAnd->numChildren(); i++) {
- if (auto child = createPredicatesOnBucketLevelField(nextAnd->getChild(i))) {
- andMatchExpr->add(std::move(child));
- }
- }
- if (andMatchExpr->numChildren() == 1) {
- return andMatchExpr->releaseChild(0);
- }
- if (andMatchExpr->numChildren() > 0) {
- return andMatchExpr;
- }
- } else if (ComparisonMatchExpression::isComparisonMatchExpression(matchExpr) ||
- ComparisonMatchExpressionBase::isInternalExprComparison(matchExpr->matchType())) {
- return createComparisonPredicate(
- static_cast<const ComparisonMatchExpressionBase*>(matchExpr),
- _bucketUnpacker.bucketSpec(),
- _bucketMaxSpanSeconds,
- pExpCtx->collationMatchesDefault,
- pExpCtx,
- _assumeNoMixedSchemaData);
- } else if (matchExpr->matchType() == MatchExpression::GEO) {
- auto& geoExpr = static_cast<const GeoMatchExpression*>(matchExpr)->getGeoExpression();
- if (geoExpr.getPred() == GeoExpression::WITHIN ||
- geoExpr.getPred() == GeoExpression::INTERSECT) {
- return std::make_unique<InternalBucketGeoWithinMatchExpression>(
- geoExpr.getGeometryPtr(), geoExpr.getField());
- }
- }
-
- return nullptr;
+ return BucketSpec::createPredicatesOnBucketLevelField(matchExpr,
+ _bucketUnpacker.bucketSpec(),
+ _bucketMaxSpanSeconds,
+ pExpCtx->collationMatchesDefault,
+ pExpCtx,
+ _assumeNoMixedSchemaData);
}
std::pair<boost::intrusive_ptr<DocumentSourceMatch>, boost::intrusive_ptr<DocumentSourceMatch>>
@@ -1014,8 +656,7 @@ Pipeline::SourceContainer::iterator DocumentSourceInternalUnpackBucket::doOptimi
// Some optimizations may not be safe to do if we have computed the metaField via an $addFields
// or a computed $project. We won't do those optimizations if 'haveComputedMetaField' is true.
bool haveComputedMetaField = _bucketUnpacker.bucketSpec().metaField() &&
- fieldIsComputed(_bucketUnpacker.bucketSpec(),
- _bucketUnpacker.bucketSpec().metaField().get());
+ _bucketUnpacker.bucketSpec().fieldIsComputed(_bucketUnpacker.bucketSpec().metaField().get());
// Before any other rewrites for the current stage, consider reordering with $sort.
if (auto sortPtr = dynamic_cast<DocumentSourceSort*>(std::next(itr)->get())) {
diff --git a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h
index c1b59078fae..0c5a17b4e23 100644
--- a/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h
+++ b/src/mongo/db/pipeline/document_source_internal_unpack_bucket.h
@@ -134,6 +134,12 @@ public:
Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) const;
/**
+ * Convenience wrapper around BucketSpec::createPredicatesOnBucketLevelField().
+ */
+ std::unique_ptr<MatchExpression> createPredicatesOnBucketLevelField(
+ const MatchExpression* matchExpr) const;
+
+ /**
* Attempts to split 'match' into two stages, where the first is dependent only on the metaField
* and the second is the remainder, so that applying them in sequence is equivalent to applying
* 'match' once. Will return two intrusive_ptrs to new $match stages. Either pointer may be
@@ -144,25 +150,6 @@ public:
splitMatchOnMetaAndRename(boost::intrusive_ptr<DocumentSourceMatch> match);
/**
- * Takes a predicate after $_internalUnpackBucket on a bucketed field as an argument and
- * attempts to map it to a new predicate on the 'control' field. For example, the predicate
- * {a: {$gt: 5}} will generate the predicate {control.max.a: {$_internalExprGt: 5}}, which will
- * be added before the $_internalUnpackBucket stage.
- *
- * If the original predicate is on the bucket's timeField we may also create a new predicate
- * on the '_id' field to assist in index utilization. For example, the predicate
- * {time: {$lt: new Date(...)}} will generate the following predicate:
- * {$and: [
- * {_id: {$lt: ObjectId(...)}},
- * {control.min.time: {$_internalExprLt: new Date(...)}}
- * ]}
- *
- * If the provided predicate is ineligible for this mapping, the function will return a nullptr.
- */
- std::unique_ptr<MatchExpression> createPredicatesOnBucketLevelField(
- const MatchExpression* matchExpr) const;
-
- /**
* Sets the sample size to 'n' and the maximum number of measurements in a bucket to be
* 'bucketMaxCount'. Calling this method implicitly changes the behavior from having the stage
* unpack every bucket in a collection to sampling buckets to generate a uniform random sample