SERVER-14311 account for key skips in plan ranking by returning NEED_TIME in the index scan stage

author: David Storch <david.storch@10gen.com> 2014-07-02 13:13:54 -0400
committer: David Storch <david.storch@10gen.com> 2014-07-11 13:22:16 -0400
commit: f3ba3590ce1ca1bf8f7fbb13ad8311a52df9c176 (patch)
tree: c81fe778b2e2604ae97b83d22552d19c04b45691 /src
parent: 9dbdefb125b07db9d9801b80d525f9a954bec5f2 (diff)
download: mongo-f3ba3590ce1ca1bf8f7fbb13ad8311a52df9c176.tar.gz
4 files changed, 67 insertions, 3 deletions
diff --git a/src/mongo/db/exec/index_scan.cpp b/src/mongo/db/exec/index_scan.cpp
index 30aff504f37..5e0bd22b899 100644
--- a/src/mongo/db/exec/index_scan.cpp
+++ b/src/mongo/db/exec/index_scan.cpp
@@ -56,6 +56,7 @@ namespace mongo {
                          WorkingSet* workingSet,
                          const MatchExpression* filter)
         : _txn(txn),
+          _checkEndKeys(0),
           _workingSet(workingSet),
           _hitEnd(false),
           _filter(filter), 
@@ -138,10 +139,21 @@ namespace mongo {
         // Adds the amount of time taken by work() to executionTimeMillis.
         ScopedTimer timer(&_commonStats.executionTimeMillis);
 
+        // If we examined multiple keys in a prior work cycle, make up for it here by returning
+        // NEED_TIME. This is done for plan ranking. Refer to the comment for '_checkEndKeys'
+        // in the .h for details.
+        if (_checkEndKeys > 0) {
+            --_checkEndKeys;
+            ++_commonStats.needTime;
+            return PlanStage::NEED_TIME;
+        }
+
         if (NULL == _indexCursor.get()) {
             // First call to work().  Perform possibly heavy init.
             initIndexScan();
             checkEnd();
+            ++_commonStats.needTime;
+            return PlanStage::NEED_TIME;
         }
         else if (_yieldMovedCursor) {
             _yieldMovedCursor = false;
@@ -218,6 +230,10 @@ namespace mongo {
             }
         }
 
+        if (_checkEndKeys != 0) {
+            return false;
+        }
+
         return _hitEnd || _indexCursor->isEOF();
     }
 
@@ -340,8 +356,7 @@ namespace mongo {
                     break;
                 }
 
-                // TODO: Can we do too much scanning here?  Old BtreeCursor stops scanning after a
-                // while and relies on a Matcher to make sure the result is ok.
+                ++_checkEndKeys;
             }
         }
     }
diff --git a/src/mongo/db/exec/index_scan.h b/src/mongo/db/exec/index_scan.h
index 7df62304eef..50e2336a9f9 100644
--- a/src/mongo/db/exec/index_scan.h
+++ b/src/mongo/db/exec/index_scan.h
@@ -117,6 +117,21 @@ namespace mongo {
         // transactional context for read locks. Not owned by us
         OperationContext* _txn;
 
+        // The number of keys examined during a call to checkEnd() that have not yet been
+        // accounted for by returning a NEED_TIME.
+        //
+        // Good plan ranking requires that the index scan uses one work cycle per index key
+        // examined. Since checkEnd() may examine multiple keys, we keep track of them here
+        // and make up for it later by returning NEED_TIME.
+        //
+        // Example of how this is useful for plan ranking:
+        //   Say you have indices {a: 1, b: 1} and {a: 1, x: 1, b: 1}, with predicates over
+        //   fields 'a' and 'b'. It's cheaper to use index {a: 1, b: 1}. Why? Because for
+        //   index {a: 1, x: 1, b: 1} you have to skip lots of keys due to the interceding
+        //   'x' field. This skipping is done inside checkEnd(), and we use '_checkEndKeys'
+        //   to account for it.
+        size_t _checkEndKeys;
+
         // The WorkingSet we annotate with results.  Not owned by us.
         WorkingSet* _workingSet;
 
diff --git a/src/mongo/dbtests/plan_ranking.cpp b/src/mongo/dbtests/plan_ranking.cpp
index 4908eb180ef..29a01c306c8 100644
--- a/src/mongo/dbtests/plan_ranking.cpp
+++ b/src/mongo/dbtests/plan_ranking.cpp
@@ -723,6 +723,37 @@ namespace PlanRankingTests {
         }
     };
 
+    /**
+     * Suppose we have two plans which are roughly equivalent, other than that
+     * one uses an index which involves doing a lot more skipping of index keys.
+     * Prefer the plan which does not have to do this index key skipping.
+     */
+    class PlanRankingAccountForKeySkips : public PlanRankingTestBase {
+    public:
+        void run() {
+            for (int i = 0; i < 4; ++i) {
+                insert(BSON("a" << 1 << "b" << 1));
+            }
+
+            // Second index has intervening fields which leads to more
+            // skipping of keys. We should pick a plan that uses the first.
+            addIndex(BSON("a" << 1 << "b" << 1));
+            addIndex(BSON("a" << 1 << "x" << 1 << "y" << 1 << "z" << 1 << "b" << 1));
+
+            CanonicalQuery* cq;
+            ASSERT(CanonicalQuery::canonicalize(ns,
+                                                fromjson("{a: 1, b: 1}"),
+                                                &cq).isOK());
+            ASSERT(NULL != cq);
+
+            // Expect to use index {a: 1, b: 1}.
+            QuerySolution* soln = pickBestPlan(cq);
+            ASSERT(QueryPlannerTestLib::solutionMatches(
+                        "{fetch: {node: {ixscan: {pattern: {a: 1, b: 1}}}}}",
+                        soln->root.get()));
+        }
+    };
+
     class All : public Suite {
     public:
         All() : Suite( "query_plan_ranking" ) {}
@@ -744,6 +775,7 @@ namespace PlanRankingTests {
             // add<PlanRankingChooseBetweenIxisectPlans>();
             add<PlanRankingAvoidBlockingSort>();
             add<PlanRankingWorkPlansLongEnough>();
+            add<PlanRankingAccountForKeySkips>();
         }
     } planRankingAll;
 
diff --git a/src/mongo/dbtests/query_stage_and.cpp b/src/mongo/dbtests/query_stage_and.cpp
index a7b77876237..5ebad4fb6e8 100644
--- a/src/mongo/dbtests/query_stage_and.cpp
+++ b/src/mongo/dbtests/query_stage_and.cpp
@@ -802,7 +802,9 @@ namespace QueryStageAnd {
             // This isn't true in general if the collection is not dropped beforehand.
             WorkingSetID id = WorkingSet::INVALID_ID;
 
-            // Sorted AND looks at the first child, which is an index scan over foo==1.
+            // Sorted AND looks at the first child, which is an index scan over foo==1. The
+            // first work() just does initialization, so we have to call work() twice.
+            ah->work(&id);
             ah->work(&id);
 
             // The first thing that the index scan returns (due to increasing DiskLoc trick) is the
author	David Storch <david.storch@10gen.com>	2014-07-02 13:13:54 -0400
committer	David Storch <david.storch@10gen.com>	2014-07-11 13:22:16 -0400
commit	f3ba3590ce1ca1bf8f7fbb13ad8311a52df9c176 (patch)
tree	c81fe778b2e2604ae97b83d22552d19c04b45691 /src
parent	9dbdefb125b07db9d9801b80d525f9a954bec5f2 (diff)
download	mongo-f3ba3590ce1ca1bf8f7fbb13ad8311a52df9c176.tar.gz