SERVER-54128 Don't push down expressions past sort+limit

author: David Percy <david.percy@mongodb.com> 2021-01-29 01:12:21 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-06-23 15:52:33 +0000
commit: 93e25299fbf8e0279cd855f4ce19becae230d2a1 (patch)
tree: 31d6b248add5e63f96dce60abc1f0479cf14ef82
parent: 2950f483e8cfa0970d18ae873c3f0461f033fe31 (diff)
download: mongo-93e25299fbf8e0279cd855f4ce19becae230d2a1.tar.gz
4 files changed, 185 insertions, 44 deletions
diff --git a/jstests/aggregation/bugs/expression_swap_limit.js b/jstests/aggregation/bugs/expression_swap_limit.js
new file mode 100644
index 00000000000..eef11dea51d
--- /dev/null
+++ b/jstests/aggregation/bugs/expression_swap_limit.js
@@ -0,0 +1,82 @@
+/**
+ * Pushing down computed projection before sort+limit can cause the projection to be evaluated on
+ * invalid documents and throw exceptions. This test checks that we do not push computed projection
+ * before sort+limit, preventing valid queries from failing.
+ *
+ * This test was intended to reproduce a bug, SERVER-54128.
+ */
+(function() {
+
+const coll = db.expression_swap_limit;
+coll.drop();
+
+const NUM_INVALID_DOCS = 10;
+
+const docs = Array.from({length: NUM_INVALID_DOCS}, (_, i) => ({_id: i, a: ""}));
+docs.push({_id: 99, a: 123});
+coll.insert(docs);
+
+coll.createIndex({some_other_field: 1});
+
+const predicate = {
+    $or: [
+        // One branch of the $or needs to have more than one relevant index.
+        {_id: {$lt: 9999}, some_other_field: {$ne: 3}},
+        // The other branch doesn't matter: it's only here to prevent the $or being
+        // optimized out.
+        {this_predicate_matches_nothing: true},
+    ]
+};
+const sortSpec = {
+    _id: -1
+};
+const oppositeSortSpec = {
+    _id: +1
+};
+const projection = {
+    _id: 1,
+    b: {$round: "$a"},
+};
+const pipeline1 = [
+    // We need a rooted $or to trigger the SubplanStage.
+    // Instead of one MultiPlanStage for the whole query, we get one
+    // per branch of the $or.
+    {$match: predicate},
+    // Next we need a $sort+$limit. These two stages unambiguously select _id: 99.
+    // From the user's point of view, the $limit returns a bag of 1 document.
+    {$sort: sortSpec},
+    {$limit: 1},
+    // This projection raises an error if we evaluate it on any document other than
+    // _id: 99, because that is the only document where 'a' is numeric.
+    {$project: projection},
+];
+{
+    // The pipeline should succeed.
+    const aggResult = coll.aggregate(pipeline1).toArray();
+    assert.docEq(aggResult, [{_id: 99, b: 123}]);
+
+    // The pipeline should succeed without pushing down to find.
+    const noOptResult =
+        coll.aggregate([{$_internalInhibitOptimization: {}}].concat(pipeline1)).toArray();
+    assert.docEq(noOptResult, [{_id: 99, b: 123}]);
+}
+
+// Similarly, we can select the 1 valid document by flipping the sort and skipping
+// all but one document.
+const pipeline2 = [
+    {$match: predicate},
+    {$sort: oppositeSortSpec},
+    {$skip: NUM_INVALID_DOCS},
+    {$project: projection},
+];
+{
+    // The pipeline should succeed.
+    const aggResult = coll.aggregate(pipeline2).toArray();
+    assert.docEq(aggResult, [{_id: 99, b: 123}]);
+
+    // The pipeline should succeed without pushing down to find.
+    const noOptResult =
+        coll.aggregate([{$_internalInhibitOptimization: {}}].concat(pipeline2)).toArray();
+    assert.docEq(noOptResult, [{_id: 99, b: 123}]);
+}
+})();
diff --git a/jstests/aggregation/bugs/server6192_server6193.js b/jstests/aggregation/bugs/server6192_server6193.js
index 7a395f28262..30103dfd0a8 100644
--- a/jstests/aggregation/bugs/server6192_server6193.js
+++ b/jstests/aggregation/bugs/server6192_server6193.js
@@ -20,41 +20,56 @@ const t = db.jstests_aggregation_server6192;
 t.drop();
 assert.commandWorked(t.insert({x: true}));
 
-function assertOptimized(pipeline, v) {
-    const explained = t.runCommand("aggregate", {pipeline: pipeline, explain: true});
-    const projectStage = getPlanStage(explained, "PROJECTION_DEFAULT");
-    assert.eq(projectStage.transformBy.a["$const"], v, "ensure short-circuiting worked", explained);
+function optimize(expression) {
+    const explained = t.explain().aggregate([
+        // This inhibit optimization prevents the expression being pushed down into the .find()
+        // layer, to make assertions simpler. But it doesn't prevent the $project stage itself
+        // from being optimized.
+        {$_internalInhibitOptimization: {}},
+        {$project: {result: expression}},
+    ]);
+
+    const stage = getAggPlanStage(explained, "$project");
+    assert(stage, explained);
+    assert(stage.$project.result, explained);
+    return stage.$project.result;
+}
+
+function assertOptimized(expression, value) {
+    const optimized = optimize(expression);
+    assert.docEq(optimized, {$const: value}, "ensure short-circuiting worked", optimized);
 }
 
-function assertNotOptimized(pipeline) {
-    const explained = t.runCommand("aggregate", {pipeline: pipeline, explain: true});
-    const projectStage = getPlanStage(explained, "PROJECTION_DEFAULT");
-    assert(!("$const" in projectStage.transformBy.a), "ensure no short-circuiting");
+function assertNotOptimized(expression) {
+    const optimized = optimize(expression);
+    // 'optimized' may be simpler than 'expression', but we assert it did not optimize to a
+    // constant.
+    assert.neq(Object.keys(optimized), ['$const'], "ensure no short-circuiting", optimized);
 }
 
 // short-circuiting for $and
-assertOptimized([{$project: {a: {$and: [0, '$x']}}}], false);
-assertOptimized([{$project: {a: {$and: [0, 1, '$x']}}}], false);
-assertOptimized([{$project: {a: {$and: [0, 1, '', '$x']}}}], false);
+assertOptimized({$and: [0, '$x']}, false);
+assertOptimized({$and: [0, 1, '$x']}, false);
+assertOptimized({$and: [0, 1, '', '$x']}, false);
 
-assertOptimized([{$project: {a: {$and: [1, 0, '$x']}}}], false);
-assertOptimized([{$project: {a: {$and: [1, '', 0, '$x']}}}], false);
-assertOptimized([{$project: {a: {$and: [1, 1, 0, 1]}}}], false);
+assertOptimized({$and: [1, 0, '$x']}, false);
+assertOptimized({$and: [1, '', 0, '$x']}, false);
+assertOptimized({$and: [1, 1, 0, 1]}, false);
 
 // short-circuiting for $or
-assertOptimized([{$project: {a: {$or: [1, '$x']}}}], true);
-assertOptimized([{$project: {a: {$or: [1, 0, '$x']}}}], true);
-assertOptimized([{$project: {a: {$or: [1, '', '$x']}}}], true);
+assertOptimized({$or: [1, '$x']}, true);
+assertOptimized({$or: [1, 0, '$x']}, true);
+assertOptimized({$or: [1, '', '$x']}, true);
 
-assertOptimized([{$project: {a: {$or: [0, 1, '$x']}}}], true);
-assertOptimized([{$project: {a: {$or: ['', 0, 1, '$x']}}}], true);
-assertOptimized([{$project: {a: {$or: [0, 0, 0, 1]}}}], true);
+assertOptimized({$or: [0, 1, '$x']}, true);
+assertOptimized({$or: ['', 0, 1, '$x']}, true);
+assertOptimized({$or: [0, 0, 0, 1]}, true);
 
 // examples that should not short-circuit
-assertNotOptimized([{$project: {a: {$and: [1, '$x']}}}]);
-assertNotOptimized([{$project: {a: {$or: [0, '$x']}}}]);
-assertNotOptimized([{$project: {a: {$and: ['$x', '$x']}}}]);
-assertNotOptimized([{$project: {a: {$or: ['$x', '$x']}}}]);
-assertNotOptimized([{$project: {a: {$and: ['$x']}}}]);
-assertNotOptimized([{$project: {a: {$or: ['$x']}}}]);
+assertNotOptimized({$and: [1, '$x']});
+assertNotOptimized({$or: [0, '$x']});
+assertNotOptimized({$and: ['$x', '$x']});
+assertNotOptimized({$or: ['$x', '$x']});
+assertNotOptimized({$and: ['$x']});
+assertNotOptimized({$or: ['$x']});
 }());
diff --git a/jstests/aggregation/optimize_away_pipeline.js b/jstests/aggregation/optimize_away_pipeline.js
index 887923b13ce..c783698b1fe 100644
--- a/jstests/aggregation/optimize_away_pipeline.js
+++ b/jstests/aggregation/optimize_away_pipeline.js
@@ -17,7 +17,6 @@
 (function() {
 "use strict";
 
-load("jstests/aggregation/extras/utils.js");  // For 'orderedArrayEq' and 'arrayEq'.
 load("jstests/concurrency/fsm_workload_helpers/server_types.js");  // For isWiredTiger.
 load("jstests/libs/analyze_plan.js");     // For 'aggPlanHasStage' and other explain helpers.
 load("jstests/libs/fixture_helpers.js");  // For 'isMongos' and 'isSharded'.
@@ -75,8 +74,11 @@ function assertPipelineUsesAggregation({
 
     if (expectedResult) {
         const actualResult = coll.aggregate(pipeline, pipelineOptions).toArray();
-        assert(preserveResultOrder ? orderedArrayEq(actualResult, expectedResult)
-                                   : arrayEq(actualResult, expectedResult));
+        if (preserveResultOrder) {
+            assert.docEq(actualResult, expectedResult);
+        } else {
+            assert.sameMembers(actualResult, expectedResult);
+        }
     }
 
     return explainOutput;
@@ -116,8 +118,11 @@ function assertPipelineDoesNotUseAggregation({
 
     if (expectedResult) {
         const actualResult = coll.aggregate(pipeline, pipelineOptions).toArray();
-        assert(preserveResultOrder ? orderedArrayEq(actualResult, expectedResult)
-                                   : arrayEq(actualResult, expectedResult));
+        if (preserveResultOrder) {
+            assert.docEq(actualResult, expectedResult);
+        } else {
+            assert.sameMembers(actualResult, expectedResult);
+        }
     }
 
     return explainOutput;
@@ -128,7 +133,7 @@ function testGetMore({command = null, expectedResult = null} = {}) {
     const documents =
         new DBCommandCursor(db, assert.commandWorked(db.runCommand(command)), 1 /* batchsize */)
             .toArray();
-    assert(arrayEq(documents, expectedResult));
+    assert.sameMembers(documents, expectedResult);
 }
 
 let explainOutput;
@@ -180,6 +185,19 @@ assertPipelineDoesNotUseAggregation({
     expectedStages: ["IXSCAN"],
     expectedResult: [{x: 20}]
 });
+// However, when the $project is computed, pushing it down into the find() layer would sometimes
+// have the effect of reordering it before the $sort and $limit. This can cause a valid query to
+// throw an error, as in SERVER-54128.
+assertPipelineUsesAggregation({
+    pipeline: [
+        {$match: {x: {$gte: 20}}},
+        {$sort: {x: 1}},
+        {$limit: 1},
+        {$project: {x: {$substr: ["$y", 0, 1]}, _id: 0}}
+    ],
+    expectedStages: ["IXSCAN"],
+    expectedResult: [{x: ""}]
+});
 assert.commandWorked(coll.dropIndexes());
 
 assert.commandWorked(coll.insert({_id: 4, x: 40, a: {b: "ab1"}}));
@@ -571,10 +589,9 @@ if (!FixtureHelpers.isMongos(db) && isWiredTiger(db)) {
     });
     db.setProfilingLevel(0);
     let profile = db.system.profile.find({}, {op: 1, ns: 1}).sort({ts: 1}).toArray();
-    assert(
-        arrayEq(profile,
-                [{op: "command", ns: coll.getFullName()}, {op: "getmore", ns: coll.getFullName()}]),
-        profile);
+    assert.sameMembers(
+        profile,
+        [{op: "command", ns: coll.getFullName()}, {op: "getmore", ns: coll.getFullName()}]);
     // Test getMore puts a correct namespace into profile data for a view with an optimized away
     // pipeline.
     if (!FixtureHelpers.isSharded(coll)) {
@@ -591,9 +608,9 @@ if (!FixtureHelpers.isMongos(db) && isWiredTiger(db)) {
         });
         db.setProfilingLevel(0);
         profile = db.system.profile.find({}, {op: 1, ns: 1}).sort({ts: 1}).toArray();
-        assert(arrayEq(
+        assert.sameMembers(
             profile,
-            [{op: "query", ns: view.getFullName()}, {op: "getmore", ns: view.getFullName()}]));
+            [{op: "query", ns: view.getFullName()}, {op: "getmore", ns: view.getFullName()}]);
     }
 }
 }());
diff --git a/src/mongo/db/pipeline/pipeline_d.cpp b/src/mongo/db/pipeline/pipeline_d.cpp
index 26382290319..afbe455fee4 100644
--- a/src/mongo/db/pipeline/pipeline_d.cpp
+++ b/src/mongo/db/pipeline/pipeline_d.cpp
@@ -27,6 +27,7 @@
  *    it in the license file.
  */
 
+#include "mongo/db/query/projection_parser.h"
 #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQuery
 
 #include "mongo/platform/basic.h"
@@ -650,11 +651,18 @@ SkipThenLimit extractSkipAndLimitForPushdown(Pipeline* pipeline) {
  *    2. If there is no inclusion projection at the front of the pipeline, but there is a finite
  *       dependency set, a projection representing this dependency set will be pushed down.
  *    3. Otherwise, an empty projection is returned and no projection push down will happen.
+ *
+ * If 'allowExpressions' is true, the returned projection may include expressions (which can only
+ * happen in case 1). If 'allowExpressions' is false and the projection we find has expressions,
+ * then we fall through to case 2 and attempt to push down a pure-inclusion projection based on its
+ * dependencies.
  */
-auto buildProjectionForPushdown(const DepsTracker& deps, Pipeline* pipeline) {
+auto buildProjectionForPushdown(const DepsTracker& deps,
+                                Pipeline* pipeline,
+                                bool allowExpressions) {
     auto&& sources = pipeline->getSources();
 
-    // Short-circuit if the pipeline is emtpy, there is no projection and nothing to push down.
+    // Short-circuit if the pipeline is empty: there is no projection and nothing to push down.
     if (sources.empty()) {
         return BSONObj();
     }
@@ -663,17 +671,24 @@ auto buildProjectionForPushdown(const DepsTracker& deps, Pipeline* pipeline) {
             exact_pointer_cast<DocumentSourceSingleDocumentTransformation*>(sources.front().get());
         projStage) {
         if (projStage->getType() == TransformerInterface::TransformerType::kInclusionProjection) {
-            // If there is an inclusion projection at the front of the pipeline, we have case 1.
             auto projObj =
                 projStage->getTransformer().serializeTransformation(boost::none).toBson();
-            sources.pop_front();
-            return projObj;
+            auto projAst = projection_ast::parse(projStage->getContext(),
+                                                 projObj,
+                                                 ProjectionPolicies::aggregateProjectionPolicies());
+            if (!projAst.hasExpressions() || allowExpressions) {
+                // If there is an inclusion projection at the front of the pipeline, we have case 1.
+                sources.pop_front();
+                return projObj;
+            }
         }
     }
 
     // Depending of whether there is a finite dependency set, either return a projection
     // representing this dependency set, or an empty BSON, meaning no projection push down will
     // happen. This covers cases 2 and 3.
+    if (deps.getNeedsAnyMetadata())
+        return BSONObj();
     return deps.toProjectionWithoutMetadata();
 }
 }  // namespace
@@ -902,7 +917,19 @@ StatusWith<std::unique_ptr<PlanExecutor, PlanExecutor::Deleter>> PipelineD::prep
         // Build a BSONObj representing a projection eligible for pushdown. If there is an inclusion
         // projection at the front of the pipeline, it will be removed and handled by the PlanStage
         // layer. If a projection cannot be pushed down, an empty BSONObj will be returned.
-        projObj = buildProjectionForPushdown(deps, pipeline);
+
+        // In most cases .find() behaves as if it evaluates in a predictable order:
+        //     predicate, sort, skip, limit, projection.
+        // But there is at least one case where it runs the projection before the sort/skip/limit:
+        // when the predicate has a rooted $or.  (In that case we plan each branch of the $or
+        // separately, using Subplan, and include the projection on each branch.)
+
+        // To work around this behavior, don't allow pushing down expressions if we are also going
+        // to push down a sort, skip or limit. We don't want the expressions to be evaluated on any
+        // documents that the sort/skip/limit would have filtered out. (The sort stage can be a
+        // top-k sort, which both sorts and limits.)
+        bool allowExpressions = !sortStage && !skipThenLimit.getSkip() && !skipThenLimit.getLimit();
+        projObj = buildProjectionForPushdown(deps, pipeline, allowExpressions);
         plannerOpts |= QueryPlannerParams::RETURN_OWNED_DATA;
     }
author	David Percy <david.percy@mongodb.com>	2021-01-29 01:12:21 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-06-23 15:52:33 +0000
commit	93e25299fbf8e0279cd855f4ce19becae230d2a1 (patch)
tree	31d6b248add5e63f96dce60abc1f0479cf14ef82
parent	2950f483e8cfa0970d18ae873c3f0461f033fe31 (diff)
download	mongo-93e25299fbf8e0279cd855f4ce19becae230d2a1.tar.gz