diff options
author | Ian Boros <puppyofkosh@gmail.com> | 2019-03-12 19:29:10 -0400 |
---|---|---|
committer | Ian Boros <puppyofkosh@gmail.com> | 2019-04-04 13:50:12 -0400 |
commit | eda597a9970ae8f1f7a4c4ede0c0f5802c612d35 (patch) | |
tree | 0565a92725c3e710fd680b964d92eefed011f165 | |
parent | 54fd726353b8cf6d4ba1a78a1dc7eb035d1fb39f (diff) | |
download | mongo-eda597a9970ae8f1f7a4c4ede0c0f5802c612d35.tar.gz |
SERVER-13298 distinct now uses index for multikey dotted fields
4 files changed, 209 insertions, 5 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml index e4c1e157da7..9020a33489b 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_multi_stmt_txn_jscore_passthrough.yml @@ -92,6 +92,7 @@ selector: - jstests/core/agg_hint.js - jstests/core/and.js - jstests/core/collation.js + - jstests/core/distinct_multikey_dotted_path.js - jstests/core/explain_shell_helpers.js - jstests/core/index_partial_read_ops.js - jstests/core/optimized_match_explain.js diff --git a/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml index edbdb2bbf13..6c1999c77f9 100644 --- a/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/sharded_multi_stmt_txn_jscore_passthrough.yml @@ -140,6 +140,7 @@ selector: - jstests/core/agg_hint.js - jstests/core/and.js - jstests/core/collation.js + - jstests/core/distinct_multikey_dotted_path.js - jstests/core/explain_shell_helpers.js - jstests/core/index_partial_read_ops.js - jstests/core/optimized_match_explain.js diff --git a/jstests/core/distinct_multikey_dotted_path.js b/jstests/core/distinct_multikey_dotted_path.js new file mode 100644 index 00000000000..7b2bee44545 --- /dev/null +++ b/jstests/core/distinct_multikey_dotted_path.js @@ -0,0 +1,207 @@ +/** + * Test distinct() on multikey indexes using a dotted path. + * + * Assumes the collection is not sharded, because sharding the collection could result in different + * plans being chosen on different shards (for example, if an index is multikey on one shard but + * not another). + * Doesn't support stepdowns because it runs explain() on an aggregation (which can apparently + * return incomplete results). + * @tags: [assumes_unsharded_collection, does_not_support_stepdowns] + */ +(function() { + "use strict"; + load("jstests/libs/analyze_plan.js"); // For planHasStage(). + + const coll = db.distinct_multikey; + coll.drop(); + assert.commandWorked(coll.createIndex({"a.b.c": 1})); + + assert.commandWorked(coll.insert({a: {b: {c: 1}}})); + assert.commandWorked(coll.insert({a: {b: {c: 2}}})); + assert.commandWorked(coll.insert({a: {b: {c: 3}}})); + assert.commandWorked(coll.insert({a: {b: {notRelevant: 3}}})); + assert.commandWorked(coll.insert({a: {notRelevant: 3}})); + + const numPredicate = {"a.b.c": {$gt: 0}}; + + function getAggPipelineForDistinct(path) { + return [{$group: {_id: "$" + path}}]; + } + + // Run an agg pipeline with a $group, and convert the results so they're equivalent + // to what a distinct() would return. + function distinctResultsFromPipeline(pipeline) { + const res = coll.aggregate(pipeline).toArray(); + return res.map((x) => x._id); + } + + // Be sure a distinct scan is used when the index is not multi key. + (function testDistinctWithNonMultikeyIndex() { + const results = coll.distinct("a.b.c"); + // TODO SERVER-14832: Returning 'null' here is inconsistent with the behavior when no index + // is present. + assert.sameMembers([1, 2, 3, null], results); + + const expl = coll.explain().distinct("a.b.c"); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl); + + // Do an equivalent query using $group. + const pipeline = getAggPipelineForDistinct("a.b.c"); + const aggResults = distinctResultsFromPipeline(pipeline); + assert.sameMembers(aggResults, results); + const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline)); + assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0); + })(); + + // Distinct with a predicate. + (function testDistinctWithPredWithNonMultikeyIndex() { + const results = coll.distinct("a.b.c", numPredicate); + assert.sameMembers([1, 2, 3], results); + + const expl = coll.explain().distinct("a.b.c", numPredicate); + + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl); + + // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results + // from the aggregation with a predicate (a $match preceeding the $group) may not match the + // results from distinct() with a predicate. + })(); + + // Make the index multi key. + assert.commandWorked(coll.insert({a: {b: [{c: 4}, {c: 5}]}})); + assert.commandWorked(coll.insert({a: {b: [{c: 4}, {c: 6}]}})); + // Empty array is indexed as 'undefined'. + assert.commandWorked(coll.insert({a: {b: {c: []}}})); + + // We should still use the index as long as the path we distinct() on is never an array + // index. + (function testDistinctWithMultikeyIndex() { + const multiKeyResults = coll.distinct("a.b.c"); + // TODO SERVER-14832: Returning 'null' and 'undefined' here is inconsistent with the + // behavior when no index is present. + assert.sameMembers([1, 2, 3, 4, 5, 6, null, undefined], multiKeyResults); + const expl = coll.explain().distinct("a.b.c"); + + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN")); + + // Do the same thing with aggregation. + const pipeline = getAggPipelineForDistinct("a.b.c"); + const aggResults = distinctResultsFromPipeline(pipeline); + assert.sameMembers(aggResults, multiKeyResults); + const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline)); + assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0); + })(); + + // We cannot use the DISTINCT_SCAN optimization when there is a multikey path in the key and + // there is a predicate. The reason is that we may have a predicate like {a: 4}, and two + // documents: {a: [4, 5]}, {a: [4, 6]}. With a DISTINCT_SCAN, we would "skip over" one of the + // documents, and leave out either '5' or '6', rather than providing the correct result of + // [4, 5, 6]. The test below is for a similar case. + (function testDistinctWithPredWithMultikeyIndex() { + const pred = {"a.b.c": 4}; + const results = coll.distinct("a.b.c", pred); + assert.sameMembers([4, 5, 6], results); + + const expl = coll.explain().distinct("a.b.c", pred); + assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"), expl); + + // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results + // from the aggregation with a predicate (a $match preceeding the $group) may not match the + // results from distinct() with a predicate. + })(); + + // Perform a distinct on a path where the last component is multikey. + (function testDistinctOnPathWhereLastComponentIsMultiKey() { + assert.commandWorked(coll.createIndex({"a.b": 1})); + const multiKeyResults = coll.distinct("a.b"); + assert.sameMembers( + [ + null, // From the document with no 'b' field. TODO SERVER-14832: this is + // inconsistent with behavior when no index is present. + {c: 1}, + {c: 2}, + {c: 3}, + {c: 4}, + {c: 5}, + {c: 6}, + {c: []}, + {notRelevant: 3} + ], + multiKeyResults); + + const expl = coll.explain().distinct("a.b"); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN")); + + // Do the same thing with aggregation. + const pipeline = getAggPipelineForDistinct("a.b"); + const aggResults = distinctResultsFromPipeline(pipeline); + assert.sameMembers(aggResults, multiKeyResults); + const aggExpl = assert.commandWorked(coll.explain().aggregate(pipeline)); + assert.gt(getAggPlanStages(aggExpl, "DISTINCT_SCAN").length, 0); + })(); + + (function testDistinctOnPathWhereLastComponentIsMultiKeyWithPredicate() { + assert.commandWorked(coll.createIndex({"a.b": 1})); + const pred = {"a.b": {$type: "array"}}; + const multiKeyResults = coll.distinct("a.b", pred); + assert.sameMembers( + [ + {c: 4}, + {c: 5}, + {c: 6}, + ], + multiKeyResults); + + const expl = coll.explain().distinct("a.b", pred); + assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN")); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN")); + + // TODO: SERVER-40465 not checking the results for an equivalent aggregation, since results + // from the aggregation with a predicate (a $match preceeding the $group) may not match the + // results from distinct() with a predicate. + })(); + + // If the path we distinct() on includes an array index, a COLLSCAN should be used, + // even if an index is available on the prefix to the array component ("a.b" in this case). + (function testDistinctOnNumericMultikeyPathNoIndex() { + const res = coll.distinct("a.b.0"); + assert.eq(res, [{c: 4}]); + + const expl = coll.explain().distinct("a.b.0"); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "COLLSCAN"), expl); + + // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will + // only treat '0' as a field name (not array index). + })(); + + // Creating an index on "a.b.0" and doing a distinct on it should be able to use DISTINCT_SCAN. + (function testDistinctOnNumericMultikeyPathWithIndex() { + assert.commandWorked(coll.createIndex({"a.b.0": 1})); + assert.commandWorked(coll.insert({a: {b: {0: "hello world"}}})); + const res = coll.distinct("a.b.0"); + assert.sameMembers(res, [{c: 4}, "hello world"]); + + const expl = coll.explain().distinct("a.b.0"); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl); + + // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will + // only treat '0' as a field name (not array index). + })(); + + // Creating an index on "a.b.0" and doing a distinct on it should use an IXSCAN, as "a.b" is + // multikey. See explanation above about why a DISTINCT_SCAN cannot be used when the path + // given is multikey. + (function testDistinctWithPredOnNumericMultikeyPathWithIndex() { + const pred = {"a.b.0": {$type: "object"}}; + const res = coll.distinct("a.b.0", pred); + assert.sameMembers(res, [{c: 4}]); + + const expl = coll.explain().distinct("a.b.0", pred); + assert.eq(false, planHasStage(db, expl.queryPlanner.winningPlan, "DISTINCT_SCAN"), expl); + assert.eq(true, planHasStage(db, expl.queryPlanner.winningPlan, "IXSCAN"), expl); + + // Will not attempt the equivalent query with aggregation, since $group by "a.b.0" will + // only treat '0' as a field name (not array index). + })(); +})(); diff --git a/src/mongo/db/query/get_executor.cpp b/src/mongo/db/query/get_executor.cpp index 62517e5a0da..03339229f1e 100644 --- a/src/mongo/db/query/get_executor.cpp +++ b/src/mongo/db/query/get_executor.cpp @@ -1151,7 +1151,6 @@ bool getDistinctNodeIndex(const std::vector<IndexEntry>& indices, const CollatorInterface* collator, size_t* indexOut) { invariant(indexOut); - bool isDottedField = str::contains(field, '.'); int minFields = std::numeric_limits<int>::max(); for (size_t i = 0; i < indices.size(); ++i) { // Skip indices with non-matching collator. @@ -1166,10 +1165,6 @@ bool getDistinctNodeIndex(const std::vector<IndexEntry>& indices, if (indices[i].filterExpr) { continue; } - // Skip multikey indices if we are projecting on a dotted field. - if (indices[i].multikey && isDottedField) { - continue; - } // Skip indices where the first key is not field. if (indices[i].keyPattern.firstElement().fieldNameStringData() != StringData(field)) { continue; |