author     David Percy <david.percy@mongodb.com>              2022-09-26 15:16:37 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>   2022-10-26 19:48:19 +0000
commit     136d831a284a5aa7277839c390588ff8f20ab179 (patch)
tree       1a7b6fcf32f003fc614fa2de08f65431e3875d6d /jstests
parent     73fa6daf31440329b55d0971998e8c0359f08392 (diff)
download   mongo-136d831a284a5aa7277839c390588ff8f20ab179.tar.gz
SERVER-68596 [CQF] Convert dotted $elemMatch to SargableNode
The main change is to allow paths like Traverse (ComposeM ...) to
be Sargable. We add a Traverse to each conjunct as if the original
path were ComposeM (Traverse ...) (Traverse ...). This is an
over-approximation, so we mark it perf-only and keep the original
predicate.
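
As an illustration (a sketch added alongside this page, not part of the
original commit message; the collection name is a placeholder and a
CQF-enabled server is assumed), this is the query shape that becomes
sargable, mirroring the new jstests/cqf/elemmatch_bounds.js test below:

    // Hypothetical collection; an index on the dotted path is what the rewrite targets.
    const coll = db.example_dotted_elemmatch;
    assert.commandWorked(coll.createIndex({'a.b': 1}));
    // The $elemMatch conjunction produces a path like Traverse (ComposeM (Gt 1) (Lt 3)).
    // With this change it is over-approximated as ComposeM (Traverse ...) (Traverse ...),
    // so the predicate can become a SargableNode and use the {'a.b': 1} index, while the
    // original predicate is kept as a residual (perf-only) filter.
    const explain = coll.explain().aggregate([{$match: {'a.b': {$elemMatch: {$gt: 1, $lt: 3}}}}]);
    printjson(explain.queryPlanner.winningPlan.optimizerPlan);  // expect an IndexScan on a.b_1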
A separate but related improvement: we now make use of more precise
index metadata to remove Traverse nodes. An index on a dotted path
such as {'a.b': 1} may have metadata telling us that 'b' is never
an array, even if 'a' is multikey.
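
For example (again an illustrative sketch, mirroring the new
jstests/query_golden/non_multikey_paths.js test below; the collection
name is a placeholder):

    // The 'one.*' components never hold arrays here, so the index metadata
    // can record them as non-multikey.
    const coll = db.example_non_multikey_paths;
    assert.commandWorked(coll.createIndex({'one.one.one.one': 1}));
    assert.commandWorked(coll.insert({one: {one: {one: {one: 2}}}}));
    const explain = coll.explain().aggregate([{$match: {'one.one.one.one': 2}}]);
    // With precise multikeyness metadata the Traverse nodes can be removed, and the
    // leaf stage is an IndexScan with the tight interval [ 2, 2 ].
    printjson(leftmostLeafStage(explain));  // helper from jstests/libs/optimizer_utils.js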
Also, slightly improve lowerPartialSchemaRequirement's ability to
turn ranges back into PathArr / PathObj. This rewrite belongs in the
PartialSchemaReqLowerTransport so that we recognize these intervals
no matter where they occur in the BoolExpr.
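
Concretely (an illustrative sketch; the interval and filter strings come
from the new elemmatch_bounds.js test below), an index scan whose interval
spans the entire array type bracket says nothing beyond "this value is an
array", so it can be lowered back into a PathArr-style isArray() check:

    // Assumes the cqf_elemmatch_bounds collection and {c: 1} index created in the test below.
    const coll = db.cqf_elemmatch_bounds;
    const explain = coll.explain().aggregate([{$match: {c: {$elemMatch: {$elemMatch: {$eq: 2}}}}}]);
    const skeleton = getPlanSkeleton(explain.queryPlanner.winningPlan.optimizerPlan,
                                     {extraKeepKeys: ['indexDefName', 'interval', 'filter']});
    // Expect an IndexScan with interval [ [ ], BinData(0,"") ) (the array type bracket)
    // and residual filters of the form fillEmpty(isArray(...), false).
    printjson(skeleton);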
Diffstat (limited to 'jstests')
-rw-r--r--  jstests/cqf/elemmatch_bounds.js                           216
-rw-r--r--  jstests/libs/optimizer_utils.js                           125
-rw-r--r--  jstests/query_golden/expected_output/non_multikey_paths    51
-rw-r--r--  jstests/query_golden/non_multikey_paths.js                 59
4 files changed, 445 insertions, 6 deletions
diff --git a/jstests/cqf/elemmatch_bounds.js b/jstests/cqf/elemmatch_bounds.js
new file mode 100644
index 00000000000..45665f2197f
--- /dev/null
+++ b/jstests/cqf/elemmatch_bounds.js
@@ -0,0 +1,216 @@
+/**
+ * Test that $elemMatch can generate good index bounds. Especially, test different
+ * ways that $elemMatch can lead to a conjunction inside a traverse.
+ *
+ * @tags: [
+ *   # Includes plan skeleton from explain in the output.
+ *   requires_cqf,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/optimizer_utils.js");  // For getPlanSkeleton.
+
+const coll = db.cqf_elemmatch_bounds;
+coll.drop();
+let id = 0;
+let docs = [];
+const numDuplicates = 100;
+for (let i = 0; i < numDuplicates; ++i) {
+    docs = docs.concat([
+        // Each 'a' is an object, not an array.
+        // Each 'c' is the same as 'a.b', which is a mixture of scalars/arrays.
+        {_id: id++, a: {b: [1, 2, 3]}, c: [1, 2, 3]},
+        {_id: id++, a: {b: 1}, c: 1},
+        {_id: id++, a: {b: 2}, c: 2},
+        {_id: id++, a: {b: 3}, c: 3},
+        {_id: id++, a: {b: [1]}, c: [1]},
+        {_id: id++, a: {b: [2]}, c: [2]},
+        {_id: id++, a: {b: [3]}, c: [3]},
+        // [1, 3] is interesting because it satisfies {$gt: 1} and {$lt: 3},
+        // but no element satisfies both.
+        {_id: id++, a: {b: [1, 3]}, c: [1, 3]},
+
+        // Examples with nested arrays: an array element is immediately another array.
+        {_id: id++, c: [[1, 2, 3]]},
+        {_id: id++, c: [[1, 3]]},
+        {_id: id++, c: [[1], [2], [3]]},
+        {_id: id++, c: [[1], [3]]},
+        {_id: id++, c: [[1]]},
+        {_id: id++, c: [[2]]},
+        {_id: id++, c: [[3]]},
+    ]);
+}
+for (let i = 0; i < numDuplicates; ++i) {
+    // Generate more docs where 'Get c Traverse Traverse Eq 2' but not 'Get c Traverse PathArr'.
+    for (let j = 0; j < 10; ++j)
+        docs = docs.concat([
+            {_id: id++, c: 2},
+        ]);
+}
+for (let i = 0; i < numDuplicates; ++i) {
+    // Generate non-matching docs to discourage collection scan.
+    assert.commandWorked(coll.insert(Array.from({length: 100}, () => ({_id: id++}))));
+}
+assert.commandWorked(coll.insert(docs));
+
+assert.commandWorked(coll.createIndex({'a.b': 1}));
+assert.commandWorked(coll.createIndex({'c': 1}));
+
+// Run the pipeline and assert it uses the expected plan.
+// Return the results with the _id field excluded.
+function run({pipeline, plan: expectedPlan}) {
+    const explain = coll.explain().aggregate(pipeline);
+    const plan = getPlanSkeleton(explain.queryPlanner.winningPlan.optimizerPlan, {
+        extraKeepKeys: ['indexDefName', 'interval', 'filter'],
+    });
+    assert.eq(plan, expectedPlan, plan);
+
+    let result = coll.aggregate(pipeline).toArray();
+    for (let doc of result) {
+        delete doc._id;
+    }
+    return result;
+}
+
+// Assert that 'doc' occurs exactly 'num' times in 'result'.
+function assertCount(result, num, doc) {
+    let matching = result.filter(d => friendlyEqual(d, doc));
+
+    let msg = `Expected ${num} occurrences of ${tojsononeline(doc)} but got ${matching.length}.`;
+    if (matching.length > 0) {
+        msg += ' For example:\n ' + matching.slice(0, 5).map(d => tojsononeline(d)).join("\n ");
+    }
+    assert.eq(matching.length, num, msg);
+}
+
+let result;
+
+// Test multikey non-$elemMatch $eq predicate includes non-arrays.
+result = run({
+    pipeline: [{$match: {'a.b': {$eq: 2}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "BinaryJoin",
+            "leftChild": {"nodeType": "IndexScan", "indexDefName": "a.b_1", "interval": "[ 2, 2 ]"},
+            "rightChild": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assertCount(result, numDuplicates, {a: {b: 2}, c: 2});
+assert.eq(result.length, numDuplicates * 3);
+
+// Test $elemMatch only matches arrays (unlike the previous query).
+result = run({
+    pipeline: [{$match: {'a.b': {$elemMatch: {$eq: 2}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "Filter",
+            "filter":
+                "fillEmpty(traverseF(getField(scan_0, \"a\"), (inputGet_6 -> let inputComposeM_1 = getField(inputGet_6, \"b\") in if fillEmpty(traverseF(inputComposeM_1, (valCmp_2 -> (valCmp_2 == 2)), false), false) then isArray(inputComposeM_1) else false), false), false)",
+            "child": {
+                "nodeType": "BinaryJoin",
+                "leftChild":
+                    {"nodeType": "IndexScan", "indexDefName": "a.b_1", "interval": "[ 2, 2 ]"},
+                "rightChild": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assertCount(result, 0, {a: {b: 2}, c: 2});  // Expect zero non-arrays.
+assert.eq(result.length, numDuplicates * 2);
+
+// Test conjunction inside a top-level elemMatch.
+result = run({
+    pipeline: [{$match: {c: {$elemMatch: {$gt: 1, $lt: 3}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "BinaryJoin",
+            "leftChild": {
+                "nodeType": "Unique",
+                "child": {"nodeType": "IndexScan", "indexDefName": "c_1", "interval": "( 1, 3 )"}
+            },
+            "rightChild": {
+                "nodeType": "Filter",
+                "filter": "fillEmpty(isArray(evalTemp_4), false)",
+                "child": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assert.eq(result.length, numDuplicates * 2);
+
+// Test conjunction inside a dotted elemMatch.
+result = run({
+    pipeline: [{$match: {'a.b': {$elemMatch: {$gt: 1, $lt: 3}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "Filter",
+            "filter":
+                "fillEmpty(traverseF(getField(scan_0, \"a\"), (inputGet_6 -> let inputComposeM_8 = getField(inputGet_6, \"b\") in if fillEmpty(traverseF(inputComposeM_8, (inputComposeM_7 -> if fillEmpty(if fillEmpty(((inputComposeM_7 <=> 3) < NumberLong(0)), false) then ((inputComposeM_7 <=> NaN) >= NumberLong(0)) else false, false) then if fillEmpty(((inputComposeM_7 <=> 1) > NumberLong(0)), false) then ((inputComposeM_7 <=> \"\") < NumberLong(0)) else false else false), false), false) then isArray(inputComposeM_8) else false), false), false)",
+            "child": {
+                "nodeType": "BinaryJoin",
+                "leftChild": {
+                    "nodeType": "Unique",
+                    "child":
+                        {"nodeType": "IndexScan", "indexDefName": "a.b_1", "interval": "( 1, 3 )"}
+                },
+                "rightChild": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {a: {b: [1, 2, 3]}, c: [1, 2, 3]});
+assertCount(result, numDuplicates, {a: {b: [2]}, c: [2]});
+assert.eq(result.length, numDuplicates * 2);
+
+// Nested $elemMatch matches nested arrays, but the bounds only handle PathArr.
+// Multikey indexes don't recursively unwind arrays, so the scalars inside the nested array don't
+// get separate index entries, so we can't generate a tight interval like [2, 2].
+result = run({
+    pipeline: [{$match: {c: {$elemMatch: {$elemMatch: {$eq: 2}}}}}],
+    plan: {
+        "nodeType": "Root",
+        "child": {
+            "nodeType": "Filter",
+            "filter":
+                "fillEmpty(traverseF(getField(scan_0, \"c\"), (inputComposeM_1 -> if fillEmpty(traverseF(inputComposeM_1, (valCmp_3 -> (valCmp_3 == 2)), false), false) then isArray(inputComposeM_1) else false), false), false)",
+            "child": {
+                "nodeType": "BinaryJoin",
+                "leftChild": {
+                    "nodeType": "Unique",
+                    "child": {
+                        "nodeType": "Filter",
+                        "filter":
+                            "fillEmpty(traverseF(evalTemp_6, (valCmp_2 -> (valCmp_2 == 2)), false), false)",
+                        "child": {
+                            "nodeType": "IndexScan",
+                            "indexDefName": "c_1",
+                            "interval": "[ [ ], BinData(0,\"\") )"
+                        }
+                    }
+                },
+                "rightChild": {
+                    "nodeType": "Filter",
+                    "filter": "fillEmpty(isArray(evalTemp_2), false)",
+                    "child": {"nodeType": "LimitSkip", "child": {"nodeType": "Seek"}}
+                }
+            }
+        }
+    },
+});
+assertCount(result, numDuplicates, {c: [[1, 2, 3]]});
+assertCount(result, numDuplicates, {c: [[1], [2], [3]]});
+assertCount(result, numDuplicates, {c: [[2]]});
+assert.eq(result.length, numDuplicates * 3);
+})();
\ No newline at end of file
diff --git a/jstests/libs/optimizer_utils.js b/jstests/libs/optimizer_utils.js
index 22483624ad9..bd66520a875 100644
--- a/jstests/libs/optimizer_utils.js
+++ b/jstests/libs/optimizer_utils.js
@@ -53,7 +53,9 @@ function leftmostLeafStage(node) {
 
 /**
  * Get a very simplified version of a plan, which only includes nodeType and nesting structure.
  */
-function getPlanSkeleton(node, recursiveKeepKeys = [], addToKeepKeys = []) {
+function getPlanSkeleton(node, options = {}) {
+    const {extraKeepKeys = [], keepKeysDeep = []} = options;
+
     const keepKeys = [
         'nodeType',
@@ -64,26 +66,137 @@ function getPlanSkeleton(node, recursiveKeepKeys = [], addToKeepKeys = []) {
         'children',
         'leftChild',
         'rightChild',
-    ].concat(addToKeepKeys);
+    ].concat(extraKeepKeys);
 
     if (Array.isArray(node)) {
-        return node.map(n => getPlanSkeleton(n));
+        return node.map(n => getPlanSkeleton(n, options));
     } else if (node === null || typeof node !== 'object') {
         return node;
     } else {
         return Object.fromEntries(
             Object.keys(node)
-                .filter(key => (keepKeys.includes(key) || recursiveKeepKeys.includes(key)))
+                .filter(key => (keepKeys.includes(key) || keepKeysDeep.includes(key)))
                 .map(key => {
-                    if (recursiveKeepKeys.includes(key)) {
+                    if (keepKeysDeep.includes(key)) {
                         return [key, node[key]];
+                    } else if (key === 'interval') {
+                        return [key, prettyInterval(node[key])];
+                    } else if (key === 'filter') {
+                        return [key, prettyExpression(node[key])];
                     } else {
-                        return [key, getPlanSkeleton(node[key], recursiveKeepKeys, addToKeepKeys)];
+                        return [key, getPlanSkeleton(node[key], options)];
                     }
                 }));
     }
 }
 
+function prettyInterval(compoundInterval) {
+    // Takes an array of intervals, each one applying to one component of a compound index key.
+    // Try to format it as a string.
+    // If either bound is not Constant, return the original JSON unchanged.
+    let result = '';
+    for (const {lowBound, highBound} of compoundInterval) {
+        if (lowBound.bound.nodeType !== 'Const' || highBound.bound.nodeType !== 'Const') {
+            return compoundInterval;
+        }
+
+        const lowInclusive = lowBound.inclusive;
+        const highInclusive = highBound.inclusive;
+        assert.eq(typeof lowInclusive, 'boolean');
+        assert.eq(typeof highInclusive, 'boolean');
+
+        result += ' ';
+        result += lowInclusive ? '[ ' : '( ';
+        result += tojson(lowBound.bound.value);
+        result += ', ';
+        result += tojson(highBound.bound.value);
+        result += highInclusive ? ' ]' : ' )';
+    }
+    return result.trim();
+}
+
+function prettyExpression(expr) {
+    switch (expr.nodeType) {
+        case 'Variable':
+            return expr.name;
+        case 'Const':
+            return tojson(expr.value);
+        case 'FunctionCall':
+            return `${expr.name}(${expr.arguments.map(a => prettyExpression(a)).join(', ')})`;
+        case 'If': {
+            const if_ = prettyExpression(expr.condition);
+            const then_ = prettyExpression(expr.then);
+            const else_ = prettyExpression(expr.else);
+            return `if ${if_} then ${then_} else ${else_}`;
+        }
+        case 'Let': {
+            const x = expr.variable;
+            const b = prettyExpression(expr.bind);
+            const e = prettyExpression(expr.expression);
+            return `let ${x} = ${b} in ${e}`;
+        }
+        case 'LambdaAbstraction': {
+            return `(${expr.variable} -> ${prettyExpression(expr.input)})`;
+        }
+        case 'BinaryOp': {
+            const left = prettyExpression(expr.left);
+            const right = prettyExpression(expr.right);
+            const op = prettyOp(expr.op);
+            return `(${left} ${op} ${right})`;
+        }
+        default:
+            return tojson(expr);
+    }
+}
+
+function prettyOp(op) {
+    // See src/mongo/db/query/optimizer/syntax/syntax.h, PATHSYNTAX_OPNAMES.
+    switch (op) {
+        /* comparison operations */
+        case 'Eq':
+            return '==';
+        case 'EqMember':
+            return 'in';
+        case 'Neq':
+            return '!=';
+        case 'Gt':
+            return '>';
+        case 'Gte':
+            return '>=';
+        case 'Lt':
+            return '<';
+        case 'Lte':
+            return '<=';
+        case 'Cmp3w':
+            return '<=>';
+
+        /* binary operations */
+        case 'Add':
+            return '+';
+        case 'Sub':
+            return '-';
+        case 'Mult':
+            return '*';
+        case 'Div':
+            return '/';
+
+        /* unary operations */
+        case 'Neg':
+            return '-';
+
+        /* logical operations */
+        case 'And':
+            return 'and';
+        case 'Or':
+            return 'or';
+        case 'Not':
+            return 'not';
+
+        default:
+            return op;
+    }
+}
+
 function navigateToPath(doc, path) {
     let result;
     let field;
diff --git a/jstests/query_golden/expected_output/non_multikey_paths b/jstests/query_golden/expected_output/non_multikey_paths
new file mode 100644
index 00000000000..2d6fa83adb8
--- /dev/null
+++ b/jstests/query_golden/expected_output/non_multikey_paths
@@ -0,0 +1,51 @@
+
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "one.one.one.one" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "one.one.one.one_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "one.one.one.many" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "one.one.one.many_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "many.one.one.one" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "many.one.one.one_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "many.one.one.many" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "many.one.one.many_1",
+    "interval" : "[ 2, 2 ]"
+}
+
+[jsTest] ----
+[jsTest] Query: [ { "$match" : { "many.many.many.many" : 2 } } ]
+[jsTest] ----
+
+Leaf stage: {
+    "nodeType" : "IndexScan",
+    "indexDefName" : "many.many.many.many_1",
+    "interval" : "[ 2, 2 ]"
+}
\ No newline at end of file
diff --git a/jstests/query_golden/non_multikey_paths.js b/jstests/query_golden/non_multikey_paths.js
new file mode 100644
index 00000000000..f0abdf78249
--- /dev/null
+++ b/jstests/query_golden/non_multikey_paths.js
@@ -0,0 +1,59 @@
+/**
+ * Test indexes with long paths, where some components are multikey and some are not.
+ * Make sure queries can use these indexes, with good bounds.
+ *
+ * @tags: [
+ *   # Checks explain.
+ *   requires_cqf,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/optimizer_utils.js");  // For leftmostLeafStage.
+
+const coll = db.cqf_non_multikey_paths;
+coll.drop();
+assert.commandWorked(coll.createIndex({'one.one.one.one': 1}));
+assert.commandWorked(coll.createIndex({'one.one.one.many': 1}));
+assert.commandWorked(coll.createIndex({'many.one.one.one': 1}));
+assert.commandWorked(coll.createIndex({'many.one.one.many': 1}));
+assert.commandWorked(coll.createIndex({'many.many.many.many': 1}));
+let i = 0;
+assert.commandWorked(coll.insert([
+    {_id: ++i, one: {one: {one: {one: 2}}}},
+    {_id: ++i, one: {one: {one: {many: [1, 2, 3]}}}},
+    {
+        _id: ++i,
+        many: [
+            {one: {one: {one: 1}}},
+            {one: {one: {one: 2}}},
+            {one: {one: {one: 3}}},
+        ]
+    },
+    {
+        _id: ++i,
+        many: [
+            {one: {one: {many: [1, 2, 3]}}},
+            {one: {one: {many: [4, 5]}}},
+        ],
+    },
+    {_id: ++i, many: [{many: [{many: [{many: [1, 2, 3]}]}]}]},
+]));
+// Generate enough documents for index to be preferable.
+assert.commandWorked(coll.insert(Array.from({length: 100}, (_, i) => ({_id: i + 1000}))));
+
+function run(pipeline) {
+    jsTestLog(`Query: ${tojsononeline(pipeline)}`);
+    const explain = coll.explain().aggregate(pipeline);
+    print(`Leaf stage: `);
+    const {nodeType, indexDefName, interval} = leftmostLeafStage(explain);
+    printjson({nodeType, indexDefName, interval: prettyInterval(interval)});
+}
+
+run([{$match: {'one.one.one.one': 2}}]);
+run([{$match: {'one.one.one.many': 2}}]);
+run([{$match: {'many.one.one.one': 2}}]);
+run([{$match: {'many.one.one.many': 2}}]);
+run([{$match: {'many.many.many.many': 2}}]);
+})();
\ No newline at end of file
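
A closing note for other callers of getPlanSkeleton (an illustrative sketch,
not part of the commit): the helper's keep-key arguments move from positional
parameters to an options object, so tests that passed arrays positionally
would need a small update, for example:

    // 'explain' is assumed to come from coll.explain().aggregate(pipeline) on a CQF plan.
    // Before this commit the call was positional, e.g. getPlanSkeleton(plan, [], ['interval']).
    const skeleton = getPlanSkeleton(explain.queryPlanner.winningPlan.optimizerPlan, {
        extraKeepKeys: ['indexDefName', 'interval', 'filter'],  // extra keys kept; 'interval'/'filter' are pretty-printed
        keepKeysDeep: [],                                       // keys whose entire subtree is kept verbatim
    });
    printjson(skeleton);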