SERVER-13298 distinct now uses index for multikey dotted fields

author: Ian Boros <puppyofkosh@gmail.com> 2019-03-12 19:29:10 -0400
committer: Ian Boros <puppyofkosh@gmail.com> 2019-04-04 13:50:12 -0400
commit: eda597a9970ae8f1f7a4c4ede0c0f5802c612d35 (patch)
tree: 0565a92725c3e710fd680b964d92eefed011f165
parent: 54fd726353b8cf6d4ba1a78a1dc7eb035d1fb39f (diff)
download: mongo-eda597a9970ae8f1f7a4c4ede0c0f5802c612d35.tar.gz
4 files changed, 209 insertions, 5 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml
index e4c1e157da7..9020a33489b 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml
@@ -92,6 +92,7 @@ selector:
   - jstests/core/agg_hint.js
   - jstests/core/and.js
   - jstests/core/collation.js
+  - jstests/core/distinct_multikey_dotted_path.js
   - jstests/core/explain_shell_helpers.js
   - jstests/core/index_partial_read_ops.js
   - jstests/core/optimized_match_explain.js
diff --git a/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml
index edbdb2bbf13..6c1999c77f9 100644
--- a/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml
@@ -140,6 +140,7 @@ selector:
   - jstests/core/agg_hint.js
   - jstests/core/and.js
   - jstests/core/collation.js
+  - jstests/core/distinct_multikey_dotted_path.js
   - jstests/core/explain_shell_helpers.js
   - jstests/core/index_partial_read_ops.js
   - jstests/core/optimized_match_explain.js
diff --git a/jstests/core/distinct_multikey_dotted_path.js b/jstests/core/distinct_multikey_dotted_path.js
new file mode 100644
index 00000000000..7b2bee44545
--- /dev/null
+++ b/jstests/core/distinct_multikey_dotted_path.js
@@ -0,0 +1,207 @@
+/**
+ * Test distinct() on multikey indexes using a dotted path.
+ *
+ * Assumes the collection is not sharded, because sharding the collection could result in different
+ * plans being chosen on different shards (for example, if an index is multikey on one shard but
+ * not another).
+ * Doesn't support stepdowns because it runs explain() on an aggregation (which can apparently
+ * return incomplete results).
+ * @tags: [assumes_unsharded_collection, does_not_support_stepdowns]
+ */
+(function() {
+    "use strict";
+    load("jstests/libs/analyze_plan.js");  // For planHasStage().
+
+    const coll = db.distinct_multikey;
+    coll.drop();
+    assert.commandWorked(coll.createIndex({"a.b.c": 1}));
+
+    assert.commandWorked(coll.insert({a: {b: {c: 1}}}));
+    assert.commandWorked(coll.insert({a: {b: {c: 2}}}));
+    assert.commandWorked(coll.insert({a: {b: {c: 3}}}));
+    assert.commandWorked(coll.insert({a: {b: {notRelevant: 3}}}));
+    assert.commandWorked(coll.insert({a: {notRelevant: 3}}));
+
+    const numPredicate = {"a.b.c": {$gt: 0}};
+
+    function getAggPipelineForDistinct(path) {
+        return [{$group: {_id: "$" + path}}];
+    }
+
+    // Run an agg pipeline with a $group, and convert the results so they're equivalent
+    // to what a distinct() would return.
+    function distinctResultsFromPipeline(pipeline) {
+        const res = coll.aggregate(pipeline).toArray();
+        return res.map((x) => x._id);
+    }
+
+    // Be sure a distinct scan is used when the index is not multi key.
+    (function testDistinctWithNonMultikeyIndex() {
+        const results = coll.distinct("a.b.c");
+        // TODO SERVER-14832: Returning 'null' here is inconsistent with the behavior when no index
+        // is present.
+        assert.sameMembers([1, 2, 3, null], results);
+
+        const expl = coll.explain().distinct("a.b.c");
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+
+        // Do an equivalent query using $group.
+        const pipeline = getAggPipelineForDistinct("a.b.c");
+        const aggResults = distinctResultsFromPipeline(pipeline);
+        assert.sameMembers(aggResults, results);
+        const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline));
+        assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0);
+    })();
+
+    // Distinct with a predicate.
+    (function testDistinctWithPredWithNonMultikeyIndex() {
+        const results = coll.distinct("a.b.c", numPredicate);
+        assert.sameMembers([1, 2, 3], results);
+
+        const expl = coll.explain().distinct("a.b.c", numPredicate);
+
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+
+        // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results
+        // from the aggregation with a predicate (a $match preceeding the $group) may not match the
+        // results from distinct() with a predicate.
+    })();
+
+    // Make the index multi key.
+    assert.commandWorked(coll.insert({a: {b: [{c: 4}, {c: 5}]}}));
+    assert.commandWorked(coll.insert({a: {b: [{c: 4}, {c: 6}]}}));
+    // Empty array is indexed as 'undefined'.
+    assert.commandWorked(coll.insert({a: {b: {c: []}}}));
+
+    // We should still use the index as long as the path we distinct() on is never an array
+    // index.
+    (function testDistinctWithMultikeyIndex() {
+        const multiKeyResults = coll.distinct("a.b.c");
+        // TODO SERVER-14832: Returning 'null' and 'undefined' here is inconsistent with the
+        // behavior when no index is present.
+        assert.sameMembers([1, 2, 3, 4, 5, 6, null, undefined], multiKeyResults);
+        const expl = coll.explain().distinct("a.b.c");
+
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"));
+
+        // Do the same thing with aggregation.
+        const pipeline = getAggPipelineForDistinct("a.b.c");
+        const aggResults = distinctResultsFromPipeline(pipeline);
+        assert.sameMembers(aggResults, multiKeyResults);
+        const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline));
+        assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0);
+    })();
+
+    // We cannot use the DISTINCT_SCAN optimization when there is a multikey path in the key and
+    // there is a predicate. The reason is that we may have a predicate like {a: 4}, and two
+    // documents: {a: [4, 5]}, {a: [4, 6]}. With a DISTINCT_SCAN, we would "skip over" one of the
+    // documents, and leave out either '5' or '6', rather than providing the correct result of
+    // [4, 5, 6]. The test below is for a similar case.
+    (function testDistinctWithPredWithMultikeyIndex() {
+        const pred = {"a.b.c": 4};
+        const results = coll.distinct("a.b.c", pred);
+        assert.sameMembers([4, 5, 6], results);
+
+        const expl = coll.explain().distinct("a.b.c", pred);
+        assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"), expl);
+
+        // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results
+        // from the aggregation with a predicate (a $match preceeding the $group) may not match the
+        // results from distinct() with a predicate.
+    })();
+
+    // Perform a distinct on a path where the last component is multikey.
+    (function testDistinctOnPathWhereLastComponentIsMultiKey() {
+        assert.commandWorked(coll.createIndex({"a.b": 1}));
+        const multiKeyResults = coll.distinct("a.b");
+        assert.sameMembers(
+            [
+              null,  // From the document with no 'b' field. TODO SERVER-14832: this is
+                     // inconsistent with behavior when no index is present.
+              {c: 1},
+              {c: 2},
+              {c: 3},
+              {c: 4},
+              {c: 5},
+              {c: 6},
+              {c: []},
+              {notRelevant: 3}
+            ],
+            multiKeyResults);
+
+        const expl = coll.explain().distinct("a.b");
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"));
+
+        // Do the same thing with aggregation.
+        const pipeline = getAggPipelineForDistinct("a.b");
+        const aggResults = distinctResultsFromPipeline(pipeline);
+        assert.sameMembers(aggResults, multiKeyResults);
+        const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline));
+        assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0);
+    })();
+
+    (function testDistinctOnPathWhereLastComponentIsMultiKeyWithPredicate() {
+        assert.commandWorked(coll.createIndex({"a.b": 1}));
+        const pred = {"a.b": {$type: "array"}};
+        const multiKeyResults = coll.distinct("a.b", pred);
+        assert.sameMembers(
+            [
+              {c: 4},
+              {c: 5},
+              {c: 6},
+            ],
+            multiKeyResults);
+
+        const expl = coll.explain().distinct("a.b", pred);
+        assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"));
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"));
+
+        // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results
+        // from the aggregation with a predicate (a $match preceeding the $group) may not match the
+        // results from distinct() with a predicate.
+    })();
+
+    // If the path we distinct() on includes an array index, a COLLSCAN should be used,
+    // even if an index is available on the prefix to the array component ("a.b" in this case).
+    (function testDistinctOnNumericMultikeyPathNoIndex() {
+        const res = coll.distinct("a.b.0");
+        assert.eq(res, [{c: 4}]);
+
+        const expl = coll.explain().distinct("a.b.0");
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "COLLSCAN"), expl);
+
+        // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will
+        // only treat '0' as a field name (not array index).
+    })();
+
+    // Creating an index on "a.b.0" and doing a distinct on it should be able to use DISTINCT_SCAN.
+    (function testDistinctOnNumericMultikeyPathWithIndex() {
+        assert.commandWorked(coll.createIndex({"a.b.0": 1}));
+        assert.commandWorked(coll.insert({a: {b: {0: "hello world"}}}));
+        const res = coll.distinct("a.b.0");
+        assert.sameMembers(res, [{c: 4}, "hello world"]);
+
+        const expl = coll.explain().distinct("a.b.0");
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+
+        // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will
+        // only treat '0' as a field name (not array index).
+    })();
+
+    // Creating an index on "a.b.0" and doing a distinct on it should use an IXSCAN, as "a.b" is
+    // multikey. See explanation above about why a DISTINCT_SCAN cannot be used when the path
+    // given is multikey.
+    (function testDistinctWithPredOnNumericMultikeyPathWithIndex() {
+        const pred = {"a.b.0": {$type: "object"}};
+        const res = coll.distinct("a.b.0", pred);
+        assert.sameMembers(res, [{c: 4}]);
+
+        const expl = coll.explain().distinct("a.b.0", pred);
+        assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+        assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"), expl);
+
+        // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will
+        // only treat '0' as a field name (not array index).
+    })();
+})();
diff --git a/src/mongo/db/query/get_executor.cpp b/src/mongo/db/query/get_executor.cpp
index 62517e5a0da..03339229f1e 100644
--- a/src/mongo/db/query/get_executor.cpp
+++ b/src/mongo/db/query/get_executor.cpp
@@ -1151,7 +1151,6 @@ bool getDistinctNodeIndex(const std::vector<IndexEntry>& indices,
                           const CollatorInterface* collator,
                           size_t* indexOut) {
     invariant(indexOut);
-    bool isDottedField = str::contains(field, '.');
     int minFields = std::numeric_limits<int>::max();
     for (size_t i = 0; i < indices.size(); ++i) {
         // Skip indices with non-matching collator.
@@ -1166,10 +1165,6 @@ bool getDistinctNodeIndex(const std::vector<IndexEntry>& indices,
         if (indices[i].filterExpr) {
             continue;
         }
-        // Skip multikey indices if we are projecting on a dotted field.
-        if (indices[i].multikey && isDottedField) {
-            continue;
-        }
         // Skip indices where the first key is not field.
         if (indices[i].keyPattern.firstElement().fieldNameStringData() != StringData(field)) {
             continue;
author	Ian Boros <puppyofkosh@gmail.com>	2019-03-12 19:29:10 -0400
committer	Ian Boros <puppyofkosh@gmail.com>	2019-04-04 13:50:12 -0400
commit	eda597a9970ae8f1f7a4c4ede0c0f5802c612d35 (patch)
tree	0565a92725c3e710fd680b964d92eefed011f165
parent	54fd726353b8cf6d4ba1a78a1dc7eb035d1fb39f (diff)
download	mongo-eda597a9970ae8f1f7a4c4ede0c0f5802c612d35.tar.gz