author     David Percy <david.percy@mongodb.com>              2022-09-26 15:16:37 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>   2022-10-26 19:48:19 +0000
commit     136d831a284a5aa7277839c390588ff8f20ab179 (patch)
tree       1a7b6fcf32f003fc614fa2de08f65431e3875d6d /jstests
parent     73fa6daf31440329b55d0971998e8c0359f08392 (diff)
download   mongo-136d831a284a5aa7277839c390588ff8f20ab179.tar.gz
SERVER-68596 [CQF] Convert dotted $elemMatch to SargableNode
The main change is to allow paths like Traverse (ComposeM ...) to
be Sargable. We add a Traverse to each conjunct as if the original
path were ComposeM (Traverse ...) (Traverse ...). This is an
over-approximation, so we mark it perf-only and keep the original
predicate.
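
As an illustration (a sketch added alongside this page, not part of the
original commit message; the collection name is a placeholder and a
CQF-enabled server is assumed), this is the query shape that becomes
sargable, mirroring the new jstests/cqf/elemmatch_bounds.js test below:

    // Hypothetical collection; an index on the dotted path is what the rewrite targets.
    const coll = db.example_dotted_elemmatch;
    assert.commandWorked(coll.createIndex({'a.b': 1}));
    // The $elemMatch conjunction produces a path like Traverse (ComposeM (Gt 1) (Lt 3)).
    // With this change it is over-approximated as ComposeM (Traverse ...) (Traverse ...),
    // so the predicate can become a SargableNode and use the {'a.b': 1} index, while the
    // original predicate is kept as a residual (perf-only) filter.
    const explain = coll.explain().aggregate([{$match: {'a.b': {$elemMatch: {$gt: 1, $lt: 3}}}}]);
    printjson(explain.queryPlanner.winningPlan.optimizerPlan);  // expect an IndexScan on a.b_1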
A separate but related improvement: we now make use of more precise
index metadata to remove Traverse nodes. An index on a dotted path
such as {'a.b': 1} may have metadata telling us that 'b' is never
an array, even if 'a' is multikey.
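
For example (again an illustrative sketch, mirroring the new
jstests/query_golden/non_multikey_paths.js test below; the collection
name is a placeholder):

    // The 'one.*' components never hold arrays here, so the index metadata
    // can record them as non-multikey.
    const coll = db.example_non_multikey_paths;
    assert.commandWorked(coll.createIndex({'one.one.one.one': 1}));
    assert.commandWorked(coll.insert({one: {one: {one: {one: 2}}}}));
    const explain = coll.explain().aggregate([{$match: {'one.one.one.one': 2}}]);
    // With precise multikeyness metadata the Traverse nodes can be removed, and the
    // leaf stage is an IndexScan with the tight interval [ 2, 2 ].
    printjson(leftmostLeafStage(explain));  // helper from jstests/libs/optimizer_utils.js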
Also, slightly improve lowerPartialSchemaRequirement's ability to
turn ranges back into PathArr / PathObj. This rewrite belongs in the
PartialSchemaReqLowerTransport so that we recognize these intervals
no matter where they occur in the BoolExpr.
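
Concretely (an illustrative sketch; the interval and filter strings come
from the new elemmatch_bounds.js test below), an index scan whose interval
spans the entire array type bracket says nothing beyond "this value is an
array", so it can be lowered back into a PathArr-style isArray() check:

    // Assumes the cqf_elemmatch_bounds collection and {c: 1} index created in the test below.
    const coll = db.cqf_elemmatch_bounds;
    const explain = coll.explain().aggregate([{$match: {c: {$elemMatch: {$elemMatch: {$eq: 2}}}}}]);
    const skeleton = getPlanSkeleton(explain.queryPlanner.winningPlan.optimizerPlan,
                                     {extraKeepKeys: ['indexDefName', 'interval', 'filter']});
    // Expect an IndexScan with interval [ [ ], BinData(0,"") ) (the array type bracket)
    // and residual filters of the form fillEmpty(isArray(...), false).
    printjson(skeleton);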
Diffstat (limited to 'jstests')
-rw-r--r--  jstests/cqf/elemmatch_bounds.js                           216
-rw-r--r--  jstests/libs/optimizer_utils.js                           125
-rw-r--r--  jstests/query_golden/expected_output/non_multikey_paths    51
-rw-r--r--  jstests/query_golden/non_multikey_paths.js                 59
4 files changed, 445 insertions, 6 deletions
diff --git a/jstests/cqf/elemmatch_bounds.js b/jstests/cqf/elemmatch_bounds.js
new file mode 100644
index 00000000000..45665f2197f
--- /dev/null
+++ b/jstests/cqf/elemmatch_bounds.js
@@ -0,0 +1,216 @@
+/**
+ * Test that $elemMatch can generate good index bounds. Especially, test different
+ * ways that $elemMatch can lead to a conjunction inside a traverse.
+ *
+ * @tags: [
+ *   # Includes plan skeleton from explain in the output.
+ *   requires_cqf,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/optimizer_utils.js");  // For getPlanSkeleton.
+
+const coll = db.cqf_elemmatch_bounds;
+coll.drop();
+let id = 0;
+let docs = [];
+const numDuplicates = 100;
+for (let i = 0; i < numDuplicates; ++i) {
+    docs = docs.concat([
+        // Each 'a' is an object, not an array.
+        // Each 'c' is the same as 'a.b', which is a mixture of scalars/arrays.
+        {_id: id++, a: {b: [1, 2, 3]}, c: [1, 2, 3]},
+        {_id: id++, a: {b: 1}, c: 1},
+        {_id: id++, a: {b: 2}, c: 2},
+        {_id: id++, a: {b: 3}, c: 3},
+        {_id: id++, a: {b: [1]}, c: [1]},
+        {_id: id++, a: {b: [2]}, c: [2]},
+        {_id: id++, a: {b: [3]}, c: [3]},
+        // [1, 3] is interesting because it satisfies {$gt: 1} and {$lt: 3},
+        // but no element satisfies both.
+        {_id: id++, a: {b: [1, 3]}, c: [1, 3]},
+
+        // Examples with nested arrays: an array element is immediately another array.
+        {_id: id++, c: [[1, 2, 3]]},
+        {_id: id++, c: [[1, 3]]},
+        {_id: id++, c: [[1], [2], [3]]},
+        {_id: id++, c: [[1], [3]]},
+        {_id: id++, c: [[1]]},
+        {_id: id++, c: [[2]]},
+        {_id: id++, c: [[3]]},
+    ]);
+}
+for (let i = 0; i < numDuplicates; ++i) {
+    // Generate more docs where 'Get c Traverse Traverse Eq 2' but not 'Get c Traverse PathArr'.
+    for (let j = 0; j < 10; ++j)
+        docs = docs.concat([
+            {_id: id++, c: 2},
+        ]);
+}
+for (let i = 0; i < numDuplicates; ++i) {
+    // Generate non-matching docs to discourage collection scan.
+    assert.commandWorked(coll.insert(Array.from({length: 100}, () => ({_id: id++}))));
+}
+assert.commandWorked(coll.insert(docs));
+
+assert.commandWorked(coll.createIndex({'a.b': 1}));
+assert.commandWorked(coll.createIndex({'c': 1}));
+
+// Run the pipeline and assert it uses the expected plan.
+// Return the results with the _id field excluded.
+function run({pipeline, plan: expectedPlan}) {
+    const explain = coll.explain().aggregate(pipeline);
+    const plan = getPlanSkeleton(explain.queryPlanner.winningPlan.optimizerPlan, {
+        extraKeepKeys: ['indexDefName', 'interval', 'filter'],
+    });
+    assert.eq(plan, expectedPlan, plan);
+
+    let result = coll.aggregate(pipeline).toArray();
+    for (let doc of result) {
+        delete doc._id;
+    }
+    return result;
+}
+
+// Assert that 'doc' occurs exactly 'num' times in 'result'.
+function assertCount(result, num, doc) {
+    let matching = result.filter(d => friendlyEqual(d, doc));
+
+    let msg = `Expected ${num} occurrences of ${tojsononeline(doc)} but got ${matching.length}.`;
+    if (matching.length > 0) {
+        msg += ' For example:\n ' + matching.slice(0, 5).map(d => tojsononeline(d)).join("\n ");
+    }
+    assert.eq(matching.length, num, msg);
+}
+
+let result;
+
+// Test multikey non-$elemMatch $eq predicate includes non-arrays.
+result = run({
+    pipeline: [{$match: {'a.b': {$eq: 2}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "BinaryJoin",
+            "leftChild": {"nodeType": "IndexScan", "indexDefName": "a.b_1", "interval": "[ 2, 2 ]"},
+            "rightChild": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assertCount(result, numDuplicates, {a: {b: 2}, c: 2});
+assert.eq(result.length, numDuplicates * 3);
+
+// Test $elemMatch only matches arrays (unlike the previous query).
+result = run({
+    pipeline: [{$match: {'a.b': {$elemMatch: {$eq: 2}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "Filter",
+            "filter":
+                "fillEmpty(traverseF(getField(scan_0, \"a\"), (inputGet_6 -> let inputComposeM_1 = getField(inputGet_6, \"b\") in if fillEmpty(traverseF(inputComposeM_1, (valCmp_2 -> (valCmp_2 == 2)), false), false) then isArray(inputComposeM_1) else false), false), false)",
+            "child": {
+                "nodeType": "BinaryJoin",
+                "leftChild":
+                    {"nodeType": "IndexScan", "indexDefName": "a.b_1", "interval": "[ 2, 2 ]"},
+                "rightChild": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assertCount(result, 0, {a: {b: 2}, c: 2});  // Expect zero non-arrays.
+assert.eq(result.length, numDuplicates * 2);
+
+// Test conjunction inside a top-level elemMatch.
+result = run({
+    pipeline: [{$match: {c: {$elemMatch: {$gt: 1, $lt: 3}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "BinaryJoin",
+            "leftChild": {
+                "nodeType": "Unique",
+                "child": {"nodeType": "IndexScan", "indexDefName": "c_1", "interval": "( 1, 3 )"}
+            },
+            "rightChild": {
+                "nodeType": "Filter",
+                "filter": "fillEmpty(isArray(evalTemp_4), false)",
+                "child": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assert.eq(result.length, numDuplicates * 2);
+
+// Test conjunction inside a dotted elemMatch.
+result = run({
+    pipeline: [{$match: {'a.b': {$elemMatch: {$gt: 1, $lt: 3}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "Filter",
+            "filter":
+                "fillEmpty(traverseF(getField(scan_0, \"a\"), (inputGet_6 -> let inputComposeM_8 = getField(inputGet_6, \"b\") in if fillEmpty(traverseF(inputComposeM_8, (inputComposeM_7 -> if fillEmpty(if fillEmpty(((inputComposeM_7 <=> 3) < NumberLong(0)), false) then ((inputComposeM_7 <=> NaN) >= NumberLong(0)) else false, false) then if fillEmpty(((inputComposeM_7 <=> 1) > NumberLong(0)), false) then ((inputComposeM_7 <=> \"\") < NumberLong(0)) else false else false), false), false) then isArray(inputComposeM_8) else false), false), false)",
+            "child": {
+                "nodeType": "BinaryJoin",
+                "leftChild": {
+                    "nodeType": "Unique",
+                    "child":
+                        {"nodeType": "IndexScan", "indexDefName": "a.b_1", "interval": "( 1, 3 )"}
+                },
+                "rightChild": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assert.eq(result.length, numDuplicates * 2);
+
+// Nested $elemMatch matches nested arrays, but the bounds only handle PathArr.
+// Multikey indexes don't recursively unwind arrays, so the scalars inside the nested array don't
+// get separate index entries, so we can't generate a tight interval like [2, 2].
+result = run({
+    pipeline: [{$match: {c: {$elemMatch: {$elemMatch: {$eq: 2}}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "Filter",
+            "filter":
+                "fillEmpty(traverseF(getField(scan_0, \"c\"), (inputComposeM_1 -> if fillEmpty(traverseF(inputComposeM_1, (valCmp_3 -> (valCmp_3 == 2)), false), false) then isArray(inputComposeM_1) else false), false), false)",
+            "child": {
+                "nodeType": "BinaryJoin",
+                "leftChild": {
+                    "nodeType": "Unique",
+                    "child": {
+                        "nodeType": "Filter",
+                        "filter":
+                            "fillEmpty(traverseF(evalTemp_6, (valCmp_2 -> (valCmp_2 == 2)), false), false)",
+                        "child": {
+                            "nodeType": "IndexScan",
+                            "indexDefName": "c_1",
+                            "interval": "[ [ ], BinData(0,\"\") )"
+                        }
+                    }
+                },
+                "rightChild": {
+                    "nodeType": "Filter",
+                    "filter": "fillEmpty(isArray(evalTemp_2), false)",
+                    "child": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+                }
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {c: [[1, 2, 3]]});
+assertCount(result, numDuplicates, {c: [[1], [2], [3]]});
+assertCount(result, numDuplicates, {c: [[2]]});
+assert.eq(result.length, numDuplicates * 3);
+})();
\ No newline at end of file
diff --git a/jstests/libs/optimizer_utils.js b/jstests/libs/optimizer_utils.js
index 22483624ad9..bd66520a875 100644
--- a/jstests/libs/optimizer_utils.js
+++ b/jstests/libs/optimizer_utils.js
@@ -53,7 +53,9 @@ function leftmostLeafStage(node) {
 
 /**
  * Get a very simplified version of a plan, which only includes nodeType and nesting structure.
  */
-function getPlanSkeleton(node, recursiveKeepKeys = [], addToKeepKeys = []) {
+function getPlanSkeleton(node, options = {}) {
+    const {extraKeepKeys = [], keepKeysDeep = []} = options;
+
     const keepKeys = [
         'nodeType',
@@ -64,26 +66,137 @@ function getPlanSkeleton(node, recursiveKeepKeys = [], addToKeepKeys = []) {
         'children',
         'leftChild',
         'rightChild',
-    ].concat(addToKeepKeys);
+    ].concat(extraKeepKeys);
 
     if (Array.isArray(node)) {
-        return node.map(n => getPlanSkeleton(n));
+        return node.map(n => getPlanSkeleton(n, options));
     } else if (node === null || typeof node !== 'object') {
         return node;
     } else {
         return Object.fromEntries(
             Object.keys(node)
-                .filter(key => (keepKeys.includes(key) || recursiveKeepKeys.includes(key)))
+                .filter(key => (keepKeys.includes(key) || keepKeysDeep.includes(key)))
                 .map(key => {
-                    if (recursiveKeepKeys.includes(key)) {
+                    if (keepKeysDeep.includes(key)) {
                         return [key, node[key]];
+                    } else if (key === 'interval') {
+                        return [key, prettyInterval(node[key])];
+                    } else if (key === 'filter') {
+                        return [key, prettyExpression(node[key])];
                     } else {
-                        return [key, getPlanSkeleton(node[key], recursiveKeepKeys, addToKeepKeys)];
+                        return [key, getPlanSkeleton(node[key], options)];
                     }
                 }));
     }
 }
 
+function prettyInterval(compoundInterval) {
+    // Takes an array of intervals, each one applying to one component of a compound index key.
+    // Try to format it as a string.
+    // If either bound is not Constant, return the original JSON unchanged.
+    let result = '';
+    for (const {lowBound, highBound} of compoundInterval) {
+        if (lowBound.bound.nodeType !== 'Const' || highBound.bound.nodeType !== 'Const') {
+            return compoundInterval;
+        }
+
+        const lowInclusive = lowBound.inclusive;
+        const highInclusive = highBound.inclusive;
+        assert.eq(typeof lowInclusive, 'boolean');
+        assert.eq(typeof highInclusive, 'boolean');
+
+        result += ' ';
+        result += lowInclusive ? '[ ' : '( ';
+        result += tojson(lowBound.bound.value);
+        result += ', ';
+        result += tojson(highBound.bound.value);
+        result += highInclusive ? ' ]' : ' )';
+    }
+    return result.trim();
+}
+
+function prettyExpression(expr) {
+    switch (expr.nodeType) {
+        case 'Variable':
+            return expr.name;
+        case 'Const':
+            return tojson(expr.value);
+        case 'FunctionCall':
+            return `${expr.name}(${expr.arguments.map(a => prettyExpression(a)).join(', ')})`;
+        case 'If': {
+            const if_ = prettyExpression(expr.condition);
+            const then_ = prettyExpression(expr.then);
+            const else_ = prettyExpression(expr.else);
+            return `if ${if_} then ${then_} else ${else_}`;
+        }
+        case 'Let': {
+            const x = expr.variable;
+            const b = prettyExpression(expr.bind);
+            const e = prettyExpression(expr.expression);
+            return `let ${x} = ${b} in ${e}`;
+        }
+        case 'LambdaAbstraction': {
+            return `(${expr.variable} -> ${prettyExpression(expr.input)})`;
+        }
+        case 'BinaryOp': {
+            const left = prettyExpression(expr.left);
+            const right = prettyExpression(expr.right);
+            const op = prettyOp(expr.op);
+            return `(${left} ${op} ${right})`;
+        }
+        default:
+            return tojson(expr);
+    }
+}
+
+function prettyOp(op) {
+    // See src/mongo/db/query/optimizer/syntax/syntax.h, PATHSYNTAX_OPNAMES.
+    switch (op) {
+        /* comparison operations */
+        case 'Eq':
+            return '==';
+        case 'EqMember':
+            return 'in';
+        case 'Neq':
+            return '!=';
+        case 'Gt':
+            return '>';
+        case 'Gte':
+            return '>=';
+        case 'Lt':
+            return '<';
+        case 'Lte':
+            return '<=';
+        case 'Cmp3w':
+            return '<=>';
+
+        /* binary operations */
+        case 'Add':
+            return '+';
+        case 'Sub':
+            return '-';
+        case 'Mult':
+            return '*';
+        case 'Div':
+            return '/';
+
+        /* unary operations */
+        case 'Neg':
+            return '-';
+
+        /* logical operations */
+        case 'And':
+            return 'and';
+        case 'Or':
+            return 'or';
+        case 'Not':
+            return 'not';
+
+        default:
+            return op;
+    }
+}
+
 function navigateToPath(doc, path) {
     let result;
     let field;
diff --git a/jstests/query_golden/expected_output/non_multikey_paths b/jstests/query_golden/expected_output/non_multikey_paths
new file mode 100644
index 00000000000..2d6fa83adb8
--- /dev/null
+++ b/jstests/query_golden/expected_output/non_multikey_paths
@@ -0,0 +1,51 @@
+
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "one.one.one.one" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "one.one.one.one_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "one.one.one.many" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "one.one.one.many_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "many.one.one.one" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "many.one.one.one_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "many.one.one.many" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "many.one.one.many_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "many.many.many.many" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "many.many.many.many_1",
+    "interval" : "[ 2, 2 ]"
+}
\ No newline at end of file
diff --git a/jstests/query_golden/non_multikey_paths.js b/jstests/query_golden/non_multikey_paths.js
new file mode 100644
index 00000000000..f0abdf78249
--- /dev/null
+++ b/jstests/query_golden/non_multikey_paths.js
@@ -0,0 +1,59 @@
+/**
+ * Test indexes with long paths, where some components are multikey and some are not.
+ * Make sure queries can use these indexes, with good bounds.
+ *
+ * @tags: [
+ *   # Checks explain.
+ *   requires_cqf,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/optimizer_utils.js");  // For leftmostLeafStage.
+
+const coll = db.cqf_non_multikey_paths;
+coll.drop();
+assert.commandWorked(coll.createIndex({'one.one.one.one': 1}));
+assert.commandWorked(coll.createIndex({'one.one.one.many': 1}));
+assert.commandWorked(coll.createIndex({'many.one.one.one': 1}));
+assert.commandWorked(coll.createIndex({'many.one.one.many': 1}));
+assert.commandWorked(coll.createIndex({'many.many.many.many': 1}));
+let i = 0;
+assert.commandWorked(coll.insert([
+    {_id: ++i, one: {one: {one: {one: 2}}}},
+    {_id: ++i, one: {one: {one: {many: [1, 2, 3]}}}},
+    {
+        _id: ++i,
+        many: [
+            {one: {one: {one: 1}}},
+            {one: {one: {one: 2}}},
+            {one: {one: {one: 3}}},
+        ]
+    },
+    {
+        _id: ++i,
+        many: [
+            {one: {one: {many: [1, 2, 3]}}},
+            {one: {one: {many: [4, 5]}}},
+        ],
+    },
+    {_id: ++i, many: [{many: [{many: [{many: [1, 2, 3]}]}]}]},
+]));
+// Generate enough documents for index to be preferable.
+assert.commandWorked(coll.insert(Array.from({length: 100}, (_, i) => ({_id: i + 1000}))));
+
+function run(pipeline) {
+    jsTestLog(`Query: ${tojsononeline(pipeline)}`);
+    const explain = coll.explain().aggregate(pipeline);
+    print(`Leaf stage: `);
+    const {nodeType, indexDefName, interval} = leftmostLeafStage(explain);
+    printjson({nodeType, indexDefName, interval: prettyInterval(interval)});
+}
+
+run([{$match: {'one.one.one.one': 2}}]);
+run([{$match: {'one.one.one.many': 2}}]);
+run([{$match: {'many.one.one.one': 2}}]);
+run([{$match: {'many.one.one.many': 2}}]);
+run([{$match: {'many.many.many.many': 2}}]);
+})();
\ No newline at end of file
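
A closing note for other callers of getPlanSkeleton (an illustrative sketch,
not part of the commit): the helper's keep-key arguments move from positional
parameters to an options object, so tests that passed arrays positionally
would need a small update, for example:

    // 'explain' is assumed to come from coll.explain().aggregate(pipeline) on a CQF plan.
    // Before this commit the call was positional, e.g. getPlanSkeleton(plan, [], ['interval']).
    const skeleton = getPlanSkeleton(explain.queryPlanner.winningPlan.optimizerPlan, {
        extraKeepKeys: ['indexDefName', 'interval', 'filter'],  // extra keys kept; 'interval'/'filter' are pretty-printed
        keepKeysDeep: [],                                       // keys whose entire subtree is kept verbatim
    });
    printjson(skeleton);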