summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIan Boros <puppyofkosh@gmail.com>2019-03-12 19:29:10 -0400
committerIan Boros <puppyofkosh@gmail.com>2019-04-04 13:50:12 -0400
commiteda597a9970ae8f1f7a4c4ede0c0f5802c612d35 (patch)
tree0565a92725c3e710fd680b964d92eefed011f165
parent54fd726353b8cf6d4ba1a78a1dc7eb035d1fb39f (diff)
downloadmongo-eda597a9970ae8f1f7a4c4ede0c0f5802c612d35.tar.gz
SERVER-13298 distinct now uses index for multikey dotted fields
-rw-r--r--buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml1
-rw-r--r--buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml1
-rw-r--r--jstests/core/distinct_multikey_dotted_path.js207
-rw-r--r--src/mongo/db/query/get_executor.cpp5
4 files changed, 209 insertions, 5 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml
index e4c1e157da7..9020a33489b 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml
@@ -92,6 +92,7 @@ selector:
- jstests/core/agg_hint.js
- jstests/core/and.js
- jstests/core/collation.js
+ - jstests/core/distinct_multikey_dotted_path.js
- jstests/core/explain_shell_helpers.js
- jstests/core/index_partial_read_ops.js
- jstests/core/optimized_match_explain.js
diff --git a/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml
index edbdb2bbf13..6c1999c77f9 100644
--- a/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml
@@ -140,6 +140,7 @@ selector:
- jstests/core/agg_hint.js
- jstests/core/and.js
- jstests/core/collation.js
+ - jstests/core/distinct_multikey_dotted_path.js
- jstests/core/explain_shell_helpers.js
- jstests/core/index_partial_read_ops.js
- jstests/core/optimized_match_explain.js
diff --git a/jstests/core/distinct_multikey_dotted_path.js b/jstests/core/distinct_multikey_dotted_path.js
new file mode 100644
index 00000000000..7b2bee44545
--- /dev/null
+++ b/jstests/core/distinct_multikey_dotted_path.js
@@ -0,0 +1,207 @@
+/**
+ * Test distinct() on multikey indexes using a dotted path.
+ *
+ * Assumes the collection is not sharded, because sharding the collection could result in different
+ * plans being chosen on different shards (for example, if an index is multikey on one shard but
+ * not another).
+ * Doesn't support stepdowns because it runs explain() on an aggregation (which can apparently
+ * return incomplete results).
+ * @tags: [assumes_unsharded_collection, does_not_support_stepdowns]
+ */
+(function() {
+ "use strict";
+ load("jstests/libs/analyze_plan.js"); // For planHasStage().
+
+ const coll = db.distinct_multikey;
+ coll.drop();
+ assert.commandWorked(coll.createIndex({"a.b.c": 1}));
+
+ assert.commandWorked(coll.insert({a: {b: {c: 1}}}));
+ assert.commandWorked(coll.insert({a: {b: {c: 2}}}));
+ assert.commandWorked(coll.insert({a: {b: {c: 3}}}));
+ assert.commandWorked(coll.insert({a: {b: {notRelevant: 3}}}));
+ assert.commandWorked(coll.insert({a: {notRelevant: 3}}));
+
+ const numPredicate = {"a.b.c": {$gt: 0}};
+
+ function getAggPipelineForDistinct(path) {
+ return [{$group: {_id: "$" + path}}];
+ }
+
+ // Run an agg pipeline with a $group, and convert the results so they're equivalent
+ // to what a distinct() would return.
+ function distinctResultsFromPipeline(pipeline) {
+ const res = coll.aggregate(pipeline).toArray();
+ return res.map((x) => x._id);
+ }
+
+ // Be sure a distinct scan is used when the index is not multi key.
+ (function testDistinctWithNonMultikeyIndex() {
+ const results = coll.distinct("a.b.c");
+ // TODO SERVER-14832: Returning 'null' here is inconsistent with the behavior when no index
+ // is present.
+ assert.sameMembers([1, 2, 3, null], results);
+
+ const expl = coll.explain().distinct("a.b.c");
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+
+ // Do an equivalent query using $group.
+ const pipeline = getAggPipelineForDistinct("a.b.c");
+ const aggResults = distinctResultsFromPipeline(pipeline);
+ assert.sameMembers(aggResults, results);
+ const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline));
+ assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0);
+ })();
+
+ // Distinct with a predicate.
+ (function testDistinctWithPredWithNonMultikeyIndex() {
+ const results = coll.distinct("a.b.c", numPredicate);
+ assert.sameMembers([1, 2, 3], results);
+
+ const expl = coll.explain().distinct("a.b.c", numPredicate);
+
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+
+ // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results
+ // from the aggregation with a predicate (a $match preceeding the $group) may not match the
+ // results from distinct() with a predicate.
+ })();
+
+ // Make the index multi key.
+ assert.commandWorked(coll.insert({a: {b: [{c: 4}, {c: 5}]}}));
+ assert.commandWorked(coll.insert({a: {b: [{c: 4}, {c: 6}]}}));
+ // Empty array is indexed as 'undefined'.
+ assert.commandWorked(coll.insert({a: {b: {c: []}}}));
+
+ // We should still use the index as long as the path we distinct() on is never an array
+ // index.
+ (function testDistinctWithMultikeyIndex() {
+ const multiKeyResults = coll.distinct("a.b.c");
+ // TODO SERVER-14832: Returning 'null' and 'undefined' here is inconsistent with the
+ // behavior when no index is present.
+ assert.sameMembers([1, 2, 3, 4, 5, 6, null, undefined], multiKeyResults);
+ const expl = coll.explain().distinct("a.b.c");
+
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"));
+
+ // Do the same thing with aggregation.
+ const pipeline = getAggPipelineForDistinct("a.b.c");
+ const aggResults = distinctResultsFromPipeline(pipeline);
+ assert.sameMembers(aggResults, multiKeyResults);
+ const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline));
+ assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0);
+ })();
+
+ // We cannot use the DISTINCT_SCAN optimization when there is a multikey path in the key and
+ // there is a predicate. The reason is that we may have a predicate like {a: 4}, and two
+ // documents: {a: [4, 5]}, {a: [4, 6]}. With a DISTINCT_SCAN, we would "skip over" one of the
+ // documents, and leave out either '5' or '6', rather than providing the correct result of
+ // [4, 5, 6]. The test below is for a similar case.
+ (function testDistinctWithPredWithMultikeyIndex() {
+ const pred = {"a.b.c": 4};
+ const results = coll.distinct("a.b.c", pred);
+ assert.sameMembers([4, 5, 6], results);
+
+ const expl = coll.explain().distinct("a.b.c", pred);
+ assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"), expl);
+
+ // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results
+ // from the aggregation with a predicate (a $match preceeding the $group) may not match the
+ // results from distinct() with a predicate.
+ })();
+
+ // Perform a distinct on a path where the last component is multikey.
+ (function testDistinctOnPathWhereLastComponentIsMultiKey() {
+ assert.commandWorked(coll.createIndex({"a.b": 1}));
+ const multiKeyResults = coll.distinct("a.b");
+ assert.sameMembers(
+ [
+ null, // From the document with no 'b' field. TODO SERVER-14832: this is
+ // inconsistent with behavior when no index is present.
+ {c: 1},
+ {c: 2},
+ {c: 3},
+ {c: 4},
+ {c: 5},
+ {c: 6},
+ {c: []},
+ {notRelevant: 3}
+ ],
+ multiKeyResults);
+
+ const expl = coll.explain().distinct("a.b");
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"));
+
+ // Do the same thing with aggregation.
+ const pipeline = getAggPipelineForDistinct("a.b");
+ const aggResults = distinctResultsFromPipeline(pipeline);
+ assert.sameMembers(aggResults, multiKeyResults);
+ const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline));
+ assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0);
+ })();
+
+ (function testDistinctOnPathWhereLastComponentIsMultiKeyWithPredicate() {
+ assert.commandWorked(coll.createIndex({"a.b": 1}));
+ const pred = {"a.b": {$type: "array"}};
+ const multiKeyResults = coll.distinct("a.b", pred);
+ assert.sameMembers(
+ [
+ {c: 4},
+ {c: 5},
+ {c: 6},
+ ],
+ multiKeyResults);
+
+ const expl = coll.explain().distinct("a.b", pred);
+ assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"));
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"));
+
+ // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results
+ // from the aggregation with a predicate (a $match preceeding the $group) may not match the
+ // results from distinct() with a predicate.
+ })();
+
+ // If the path we distinct() on includes an array index, a COLLSCAN should be used,
+ // even if an index is available on the prefix to the array component ("a.b" in this case).
+ (function testDistinctOnNumericMultikeyPathNoIndex() {
+ const res = coll.distinct("a.b.0");
+ assert.eq(res, [{c: 4}]);
+
+ const expl = coll.explain().distinct("a.b.0");
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "COLLSCAN"), expl);
+
+ // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will
+ // only treat '0' as a field name (not array index).
+ })();
+
+ // Creating an index on "a.b.0" and doing a distinct on it should be able to use DISTINCT_SCAN.
+ (function testDistinctOnNumericMultikeyPathWithIndex() {
+ assert.commandWorked(coll.createIndex({"a.b.0": 1}));
+ assert.commandWorked(coll.insert({a: {b: {0: "hello world"}}}));
+ const res = coll.distinct("a.b.0");
+ assert.sameMembers(res, [{c: 4}, "hello world"]);
+
+ const expl = coll.explain().distinct("a.b.0");
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+
+ // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will
+ // only treat '0' as a field name (not array index).
+ })();
+
+ // Creating an index on "a.b.0" and doing a distinct on it should use an IXSCAN, as "a.b" is
+ // multikey. See explanation above about why a DISTINCT_SCAN cannot be used when the path
+ // given is multikey.
+ (function testDistinctWithPredOnNumericMultikeyPathWithIndex() {
+ const pred = {"a.b.0": {$type: "object"}};
+ const res = coll.distinct("a.b.0", pred);
+ assert.sameMembers(res, [{c: 4}]);
+
+ const expl = coll.explain().distinct("a.b.0", pred);
+ assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl);
+ assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"), expl);
+
+ // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will
+ // only treat '0' as a field name (not array index).
+ })();
+})();
diff --git a/src/mongo/db/query/get_executor.cpp b/src/mongo/db/query/get_executor.cpp
index 62517e5a0da..03339229f1e 100644
--- a/src/mongo/db/query/get_executor.cpp
+++ b/src/mongo/db/query/get_executor.cpp
@@ -1151,7 +1151,6 @@ bool getDistinctNodeIndex(const std::vector<IndexEntry>& indices,
const CollatorInterface* collator,
size_t* indexOut) {
invariant(indexOut);
- bool isDottedField = str::contains(field, '.');
int minFields = std::numeric_limits<int>::max();
for (size_t i = 0; i < indices.size(); ++i) {
// Skip indices with non-matching collator.
@@ -1166,10 +1165,6 @@ bool getDistinctNodeIndex(const std::vector<IndexEntry>& indices,
if (indices[i].filterExpr) {
continue;
}
- // Skip multikey indices if we are projecting on a dotted field.
- if (indices[i].multikey && isDottedField) {
- continue;
- }
// Skip indices where the first key is not field.
if (indices[i].keyPattern.firstElement().fieldNameStringData() != StringData(field)) {
continue;