SERVER-73009 Compute clustered scan direction when plan is hinted or cached

author: Alyssa Wagenmaker <alyssa.wagenmaker@mongodb.com> 2023-02-01 19:08:22 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-02-02 18:29:15 +0000
commit: 55e4a05daabba24a6ec976a69335823384ed981b (patch)
tree: 54d5bf04939ae157164cc878ba67664fd876273b
parent: f5c68f8f3efb3281f2ffad462640c38268deb319 (diff)
download: mongo-55e4a05daabba24a6ec976a69335823384ed981b.tar.gz
3 files changed, 102 insertions, 64 deletions
diff --git a/jstests/noPassthrough/clustered_collection_sorted_scan.js b/jstests/noPassthrough/clustered_collection_sorted_scan.js
index 103f7094b9a..0888012917a 100644
--- a/jstests/noPassthrough/clustered_collection_sorted_scan.js
+++ b/jstests/noPassthrough/clustered_collection_sorted_scan.js
@@ -17,7 +17,7 @@ Random.setRandomSeed();
 const testConnection =
     MongoRunner.runMongod({setParameter: {supportArbitraryClusterKeyIndex: true}});
 const testDb = testConnection.getDB('local');
-const collectionSize = 200;
+const collectionSize = 10;
 const clusteredCollName = "clustered_index_sorted_scan_coll";
 const clusterField = "clusterKey";
 
@@ -40,20 +40,20 @@ for (let i = 0; i < collectionSize; ++i) {
     assert.commandWorked(nonClusteredColl.insert({[clusterField]: i, a: a}));
 }
 
-function runTest(isClustered, hasFilter, direction) {
+function runTest(isClustered, hasFilter, hasHint, direction) {
     let tsColl = isClustered ? clusteredColl : nonClusteredColl;
 
-    let query;
-    if (hasFilter) {
-        query = tsColl.find({[clusterField]: {$gt: -1}}).sort({[clusterField]: direction});
-    } else {
-        query = tsColl.find().sort({[clusterField]: direction});
-    }
+    const filter = hasFilter ? {[clusterField]: {$gt: -1}} : {};
+    const sort = {[clusterField]: direction};
+    const hint = hasHint ? {[clusterField]: 1} : {};
+
+    let query = tsColl.find(filter).sort(sort).hint(hint);
 
     function formatParamsAndPlan(plan) {
         let params = {
             isClustered: isClustered ? "true" : "false",
             hasFilter: hasFilter ? "true" : "false",
+            hasHint: hasHint ? "true" : "false",
             direction: direction ? "forward" : "backward",
         };
 
@@ -76,7 +76,7 @@ function runTest(isClustered, hasFilter, direction) {
     assert(!planHasStage(testDb, plan, "SORT"), "Unexpected sort in " + formatParamsAndPlan(plan));
 }
 
-function testCollations() {
+function testCollations(direction) {
     let strCollName = clusteredCollName + "_str";
 
     // Generate a clustered collection for the remainder of the testing
@@ -94,7 +94,7 @@ function testCollations() {
     // Because the collations don't match, we can't use the clustered index
     // to provide a sort
     let plan = tsColl.find()
-                   .sort({[clusterField]: 1})
+                   .sort({[clusterField]: direction})
                    .collation({locale: "fo", caseLevel: true})
                    .explain();
     assert(planHasStage(testDb, plan, "SORT"), "Expected sort in " + tojson(plan));
@@ -102,24 +102,26 @@ function testCollations() {
     // However, if we can exclude strings, we don't need an explicit sort even
     // if the collations don't match
     plan = tsColl.find({[clusterField]: {$gt: -1}})
-               .sort({[clusterField]: 1})
+               .sort({[clusterField]: direction})
                .collation({locale: "fo", caseLevel: true})
                .explain();
     assert(!planHasStage(testDb, plan, "SORT"), "Unxpected sort in " + tojson(plan));
     tsColl.drop();
 }
 
-function testMinMax(direction) {
+function testMinMax() {
+    // Min and max are only supported on forward collection scans.
+    const direction = 1;
     // Min and max should be between 0 and collection size
-    const minResult = 10;  // inclusive
-    const maxResult = 20;  // not inclusive
+    const minResult = 5;  // inclusive
+    const maxResult = 8;  // not inclusive
     const resultCount = maxResult - minResult;
 
     let normalCursor = nonClusteredColl.find()
                            .hint({[clusterField]: 1})
                            .min({[clusterField]: minResult})
                            .max({[clusterField]: maxResult})
-                           .sort({[clusterField]: 1});
+                           .sort({[clusterField]: direction});
     let normalResult = normalCursor.toArray();
     assert.eq(normalResult.length,
               resultCount,
@@ -129,7 +131,7 @@ function testMinMax(direction) {
                             .hint({[clusterField]: 1})
                             .min({[clusterField]: minResult})
                             .max({[clusterField]: maxResult})
-                            .sort({[clusterField]: 1});
+                            .sort({[clusterField]: direction});
     let clusterResult = clusterCursor.toArray();
     assert.eq(clusterResult.length,
               resultCount,
@@ -141,24 +143,38 @@ function testMinMax(direction) {
 }
 
 // Ensure that the plan gets cached correctly
-function testPlanCache() {
+function testPlanCache(direction) {
     clusteredColl.getPlanCache().clear();
 
     const indexName = "_a";
     assert.commandWorked(clusteredColl.createIndex({a: 1}, {name: indexName}));
 
+    const filter = {a: {$gt: -1}};
+    const projection = {_id: 0, [clusterField]: 1};
+    const sort = {[clusterField]: direction};
+
     // Because of the _a index above, we should have two alternatves -- filter via the
     // index then a blocking sort, or filter during a collection scan. Because of the blocking
     // sort and the fact that "a" doesnt actually filter anything, we expect the
     // collection scan to win.
-    let query = clusteredColl.find({a: {$gt: -1}}).sort({[clusterField]: -1});
-
-    let plan = query.explain();
+    let plan = clusteredColl.find(filter, projection).sort(sort).explain();
     assert(plan.queryPlanner.rejectedPlans.length > 0, tojson(plan));
     assert(planHasStage(testDb, plan, "COLLSCAN"), "Expected COLLSCAN in " + tojson(plan));
 
-    // Run the query to ensure it's cached
-    query.toArray();
+    let nonClusteredResults = nonClusteredColl.find(filter, projection).sort(sort).toArray();
+    assert.eq(nonClusteredResults.length, collectionSize);
+
+    // Now run the query and verify that the results are expected. Run it a few times so that the
+    // cached plan will be used.
+    assert.eq(nonClusteredResults,
+              clusteredColl.find(filter, projection).sort(sort).toArray(),
+              tojson(plan));
+    assert.eq(nonClusteredResults,
+              clusteredColl.find(filter, projection).sort(sort).toArray(),
+              tojson(plan));
+    assert.eq(nonClusteredResults,
+              clusteredColl.find(filter, projection).sort(sort).toArray(),
+              tojson(plan));
 
     // Verify that there's a cache entry for this query
     let cacheEntries = clusteredColl.getPlanCache().list();
@@ -171,16 +187,20 @@ function testPlanCache() {
 // Actually run all the tests:
 for (let isClustered = 0; isClustered <= 1; isClustered++) {
     for (let hasFilter = 0; hasFilter <= 1; hasFilter++) {
-        runTest(isClustered, hasFilter, /* direction = */ 1);
-        runTest(isClustered, hasFilter, /* direction = */ -1);
+        for (let hasHint = 0; hasHint <= 1; hasHint++) {
+            runTest(isClustered, hasFilter, hasHint, /* direction = */ 1);
+            runTest(isClustered, hasFilter, hasHint, /* direction = */ -1);
+        }
     }
 }
 
-testCollations();
-testMinMax(/* direction = */ 1);
-testMinMax(/* direction = */ -1);
+testCollations(/* direction = */ 1);
+testCollations(/* direction = */ -1);
+
+testMinMax();
 
-testPlanCache();
+testPlanCache(/* direction = */ 1);
+testPlanCache(/* direction = */ -1);
 
 // If we're sorting on multiple columns, we still need an explicit sort
 let plan = clusteredColl.find().sort({[clusterField]: 1, a: 1}).explain();
diff --git a/src/mongo/db/query/planner_access.h b/src/mongo/db/query/planner_access.h
index bd9c856ffab..03e9773a057 100644
--- a/src/mongo/db/query/planner_access.h
+++ b/src/mongo/db/query/planner_access.h
@@ -106,7 +106,7 @@ public:
     static std::unique_ptr<QuerySolutionNode> makeCollectionScan(const CanonicalQuery& query,
                                                                  bool tailable,
                                                                  const QueryPlannerParams& params,
-                                                                 int direction = 1);
+                                                                 int direction);
 
     /**
      * Return a plan that uses the provided index as a proxy for a collection scan.
diff --git a/src/mongo/db/query/query_planner.cpp b/src/mongo/db/query/query_planner.cpp
index c258c9e6867..10a5f86b464 100644
--- a/src/mongo/db/query/query_planner.cpp
+++ b/src/mongo/db/query/query_planner.cpp
@@ -469,12 +469,46 @@ static BSONObj finishMaxObj(const IndexEntry& indexEntry,
     }
 }
 
+bool providesSort(const CanonicalQuery& query, const BSONObj& kp) {
+    return query.getFindCommandRequest().getSort().isPrefixOf(
+        kp, SimpleBSONElementComparator::kInstance);
+}
+
+/**
+ * Determine whether this query has a sort that can be provided by the clustered index, if so, which
+ * direction the scan should be. If the collection is not clustered, or the sort cannot be provided,
+ * returns 'boost::none'.
+ */
+boost::optional<int> determineClusteredScanDirection(const CanonicalQuery& query,
+                                                     const QueryPlannerParams& params) {
+    if (params.clusteredInfo && query.getSortPattern() &&
+        CollatorInterface::collatorsMatch(params.clusteredCollectionCollator,
+                                          query.getCollator())) {
+        auto kp = clustered_util::getSortPattern(params.clusteredInfo->getIndexSpec());
+        if (providesSort(query, kp)) {
+            return 1;
+        } else if (providesSort(query, QueryPlannerCommon::reverseSortObj(kp))) {
+            return -1;
+        }
+    }
+
+    return boost::none;
+}
+
+/**
+ * Determine the direction of the scan needed for the query. Defaults to 1 unless this is a
+ * clustered collection and we have a sort that can be provided by the clustered index.
+ */
+int determineCollscanDirection(const CanonicalQuery& query, const QueryPlannerParams& params) {
+    return determineClusteredScanDirection(query, params).value_or(1);
+}
+
 std::unique_ptr<QuerySolution> buildCollscanSoln(const CanonicalQuery& query,
                                                  bool tailable,
                                                  const QueryPlannerParams& params,
-                                                 int direction = 1) {
-    std::unique_ptr<QuerySolutionNode> solnRoot(
-        QueryPlannerAccess::makeCollectionScan(query, tailable, params, direction));
+                                                 boost::optional<int> direction = boost::none) {
+    std::unique_ptr<QuerySolutionNode> solnRoot(QueryPlannerAccess::makeCollectionScan(
+        query, tailable, params, direction.value_or(determineCollscanDirection(query, params))));
     return QueryPlannerAnalysis::analyzeDataAccess(query, params, std::move(solnRoot));
 }
 
@@ -491,11 +525,6 @@ std::unique_ptr<QuerySolution> buildWholeIXSoln(
     return QueryPlannerAnalysis::analyzeDataAccess(query, params, std::move(solnRoot));
 }
 
-bool providesSort(const CanonicalQuery& query, const BSONObj& kp) {
-    return query.getFindCommandRequest().getSort().isPrefixOf(
-        kp, SimpleBSONElementComparator::kInstance);
-}
-
 StatusWith<std::unique_ptr<PlanCacheIndexTree>> QueryPlanner::cacheDataFromTaggedTree(
     const MatchExpression* const taggedTree, const vector<IndexEntry>& relevantIndices) {
     if (!taggedTree) {
@@ -661,7 +690,7 @@ StatusWith<std::unique_ptr<QuerySolution>> QueryPlanner::planFromCache(
     } else if (SolutionCacheData::COLLSCAN_SOLN == winnerCacheData.solnType) {
         // The cached solution is a collection scan. We don't cache collscans
         // with tailable==true, hence the false below.
-        auto soln = buildCollscanSoln(query, false, params);
+        auto soln = buildCollscanSoln(query, false, params, winnerCacheData.wholeIXSolnDir);
         if (!soln) {
             return Status(ErrorCodes::NoQueryExecutionPlans,
                           "plan cache error: collection scan soln");
@@ -1260,32 +1289,19 @@ StatusWith<std::vector<std::unique_ptr<QuerySolution>>> QueryPlanner::plan(
 
         // The base index is sorted on some key, so it's possible we might want to use
         // a collection scan to provide the sort requested
-        if (params.clusteredInfo) {
-            if (CollatorInterface::collatorsMatch(params.clusteredCollectionCollator,
-                                                  query.getCollator())) {
-                auto kp = clustered_util::getSortPattern(params.clusteredInfo->getIndexSpec());
-                int direction = 0;
-                if (providesSort(query, kp)) {
-                    direction = 1;
-                } else if (providesSort(query, QueryPlannerCommon::reverseSortObj(kp))) {
-                    direction = -1;
-                }
-
-                if (direction != 0) {
-                    auto soln = buildCollscanSoln(query, isTailable, params, direction);
-                    if (soln) {
-                        LOGV2_DEBUG(6082401,
-                                    5,
-                                    "Planner: outputting soln that uses clustered index to "
-                                    "provide sort");
-                        SolutionCacheData* scd = new SolutionCacheData();
-                        scd->solnType = SolutionCacheData::COLLSCAN_SOLN;
-                        scd->wholeIXSolnDir = direction;
+        if (auto direction = determineClusteredScanDirection(query, params)) {
+            auto soln = buildCollscanSoln(query, isTailable, params, direction);
+            if (soln) {
+                LOGV2_DEBUG(6082401,
+                            5,
+                            "Planner: outputting soln that uses clustered index to "
+                            "provide sort");
+                SolutionCacheData* scd = new SolutionCacheData();
+                scd->solnType = SolutionCacheData::COLLSCAN_SOLN;
+                scd->wholeIXSolnDir = *direction;
 
-                        soln->cacheData.reset(scd);
-                        out.push_back(std::move(soln));
-                    }
-                }
+                soln->cacheData.reset(scd);
+                out.push_back(std::move(soln));
             }
         }
     }
@@ -1349,7 +1365,8 @@ StatusWith<std::vector<std::unique_ptr<QuerySolution>>> QueryPlanner::plan(
     }
 
     if (possibleToCollscan && (collscanRequested || collScanRequired)) {
-        auto collscan = buildCollscanSoln(query, isTailable, params);
+        auto direction = determineCollscanDirection(query, params);
+        auto collscan = buildCollscanSoln(query, isTailable, params, direction);
         if (!collscan && collScanRequired) {
             return Status(ErrorCodes::NoQueryExecutionPlans,
                           "Failed to build collection scan soln");
@@ -1361,6 +1378,7 @@ StatusWith<std::vector<std::unique_ptr<QuerySolution>>> QueryPlanner::plan(
                         "collectionScan"_attr = redact(collscan->toString()));
             SolutionCacheData* scd = new SolutionCacheData();
             scd->solnType = SolutionCacheData::COLLSCAN_SOLN;
+            scd->wholeIXSolnDir = direction;
             collscan->cacheData.reset(scd);
             out.push_back(std::move(collscan));
         }
author	Alyssa Wagenmaker <alyssa.wagenmaker@mongodb.com>	2023-02-01 19:08:22 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-02-02 18:29:15 +0000
commit	55e4a05daabba24a6ec976a69335823384ed981b (patch)
tree	54d5bf04939ae157164cc878ba67664fd876273b
parent	f5c68f8f3efb3281f2ffad462640c38268deb319 (diff)
download	mongo-55e4a05daabba24a6ec976a69335823384ed981b.tar.gz