1 files changed, 221 insertions, 0 deletions
diff --git a/src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp b/src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp
new file mode 100644
index 00000000000..7efe1a974ba
--- /dev/null
+++ b/src/mongo/db/query/ce/heuristic_dataflow_nodes_test.cpp
@@ -0,0 +1,221 @@
+/**
+ *    Copyright (C) 2022-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/db/query/ce/heuristic_estimator.h"
+#include "mongo/db/query/ce/test_utils.h"
+#include "mongo/db/query/optimizer/props.h"
+#include "mongo/db/query/optimizer/utils/unit_test_utils.h"
+#include "mongo/db/query/optimizer/utils/utils.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::optimizer::ce {
+namespace {
+constexpr double kCollCard = 1000.0;
+const std::string kCollName = "test";
+
+constexpr double kOtherCollCard = 200.0;
+const std::string kOtherCollName = "otherTest";
+
+constexpr double kThirdCollCard = 50.0;
+const std::string kThirdCollName = "thirdTest";
+
+class DataflowCETester : public CETester {
+public:
+    DataflowCETester() : CETester(kCollName, kCollCard, kDefaultCETestPhaseSet) {}
+
+protected:
+    std::unique_ptr<cascades::CardinalityEstimator> getEstimator() const override {
+        return std::make_unique<HeuristicEstimator>();
+    }
+};
+
+bool isRootNodeFn(const ABT& node) {
+    return node.is<RootNode>();
+}
+
+TEST(CEDataflowTest, EstimateTrivialNodes) {
+    DataflowCETester t;
+    const auto matchCard = t.getMatchCE("{a: 1}", isRootNodeFn);
+
+    // Verify 'CollationNode' estimate returns the input cardinality.
+    ASSERT_CE(t, "[{$sort: {a: 1}}]", kCollCard);
+    ASSERT_CE(t, "[{$sort: {a: -1, b: 1}}]", kCollCard);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$sort: {a: 1, b: 1}}]", matchCard);
+
+    // Verify 'EvaluationNode' estimate.
+    ASSERT_CE(t, "[{$project: {a: {$add: [\"$a\", 1]}}}]", kCollCard);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$project: {a: {$add: [\"$a\", 1]}}}]", matchCard);
+}
+
+TEST(CEDataflowTest, EstimateUnionNode) {
+    auto makeUnionBranch = [](const std::string& collName) {
+        ProjectionName scanVar{"scan_" + collName};
+        auto scanNode = make<ScanNode>(scanVar, collName);
+        auto evalPath =
+            make<EvalPath>(make<PathGet>("a", make<PathIdentity>()), make<Variable>(scanVar));
+        return make<EvaluationNode>("a", std::move(evalPath), std::move(scanNode));
+    };
+
+    // Verify that the estimate of 'UnionNode' always returns the sum of estimates of its children.
+    // In the following tests we force a simple plan to be generated by passing in a 'manually'
+    // constructed ABT.
+    {
+        DataflowCETester t;
+        t.addCollection(kOtherCollName, kOtherCollCard, {});
+        t.addCollection(kThirdCollName, kThirdCollCard, {});
+        {
+            auto unionNode = make<UnionNode>(
+                ProjectionNameVector{"a"},
+                makeSeq(makeUnionBranch(kCollName), makeUnionBranch(kOtherCollName)));
+            auto rootNode = make<RootNode>(
+                properties::ProjectionRequirement{ProjectionNameVector{"a"}}, std::move(unionNode));
+            ASSERT_CE(t, rootNode, kCollCard + kOtherCollCard);
+        }
+        {
+            auto unionNode = make<UnionNode>(
+                ProjectionNameVector{"a"},
+                makeSeq(makeUnionBranch(kCollName), makeUnionBranch(kOtherCollName)));
+            auto parentUnionNode =
+                make<UnionNode>(ProjectionNameVector{"a"},
+                                makeSeq(std::move(unionNode), makeUnionBranch(kThirdCollName)));
+            auto rootNode =
+                make<RootNode>(properties::ProjectionRequirement{ProjectionNameVector{"a"}},
+                               std::move(parentUnionNode));
+            ASSERT_CE(t, rootNode, kCollCard + kOtherCollCard + kThirdCollCard);
+        }
+    }
+
+    // The following plans include a UnionNode.
+    {
+        DataflowCETester t;
+        t.setCollCard(2000);
+        t.setIndexes(
+            {{"indexA", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ true)}});
+        t.setDisableScan(true);
+        ASSERT_MATCH_CE(t, {"{a: [12]}"}, 1);
+    }
+    {
+        DataflowCETester t;
+        t.setIndexes(
+            {{"indexA", makeIndexDefinition("a", CollationOp::Ascending, /* isMultiKey */ false)},
+             {"indexB", makeIndexDefinition("b", CollationOp::Ascending, /* isMultiKey */ false)}});
+        t.setDisableScan(true);
+        ASSERT_MATCH_CE(t, {"{a: 1, b: 2}"}, 5.62341);
+    }
+}
+
+TEST(CEDataflowTest, EstimateLimitSkipNode) {
+    DataflowCETester t;
+    const CEType matchCard = t.getMatchCE("{a: 1}", isRootNodeFn);
+
+    // Verify that 'LimitSkipNode' estimate with only a limit set is min(limit, inputCE).
+    ASSERT_CE(t, "[{$limit: 1}]", 1.0);
+    ASSERT_CE(t, "[{$limit: 50}]", 50.0);
+    ASSERT_CE(t, "[{$limit: 1000}]", kCollCard);
+    ASSERT_CE(t, "[{$limit: 10000}]", kCollCard);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 1}]", 1.0);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 5}]", 5.0);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 50}]", matchCard);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$limit: 1000}]", matchCard);
+
+    // Verify that 'LimitSkipNode' estimate with only a skip set is max(inputCE - skip, 0).
+    ASSERT_CE(t, "[{$skip: 0}]", kCollCard);
+    ASSERT_CE(t, "[{$skip: 1}]", kCollCard - 1.0);
+    ASSERT_CE(t, "[{$skip: 50}]", kCollCard - 50.0);
+    ASSERT_CE(t, "[{$skip: 1000}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 10000}]", 0.0);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 1}]", matchCard - 1.0);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 5}]", matchCard - 5.0);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 50}]", 0.0);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$skip: 1000}]", 0.0);
+
+    // Test estimates for combinations of $limit & $skip.
+    ASSERT_CE(t, "[{$limit: 1}, {$skip: 1}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 1}, {$limit: 1}]", 1.0);
+    ASSERT_CE(t, "[{$limit: 1}, {$skip: 50}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 50}, {$limit: 1}]", 1.0);
+    ASSERT_CE(t, "[{$limit: 50}, {$skip: 1}]", 49.0);
+    ASSERT_CE(t, "[{$skip: 1}, {$limit: 50}]", 50.0);
+    ASSERT_CE(t, "[{$limit: 50}, {$skip: 50}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 50}, {$limit: 50}]", 50.0);
+    ASSERT_CE(t, "[{$limit: 1000}, {$skip: 50}]", kCollCard - 50.0);
+    ASSERT_CE(t, "[{$skip: 50}, {$limit: 1000}]", kCollCard - 50.0);
+    ASSERT_CE(t, "[{$limit: 50}, {$skip: 1000}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 1000}, {$limit: 50}]", 0.0);
+    ASSERT_CE(t, "[{$limit: 1000}, {$skip: 1000}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 1000}, {$limit: 1000}]", 0.0);
+
+    // Test estimates for combinations of $limit & $skip separated by a $match.
+    ASSERT_CE(t, "[{$limit: 1}, {$match: {a: 1}}, {$skip: 1}]", 0.0);
+    ASSERT_CE(t, "[{$limit: 1}, {$match: {a: 1}}, {$skip: 50}]", 0.0);
+
+    // Input card to $match: 50. $match selectivity here is sqrt(50)/50.
+    ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 1}]", 6.07107);
+    ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 50}]", 0.0);
+    ASSERT_CE(t, "[{$limit: 50}, {$match: {a: 1}}, {$skip: 1000}]", 0.0);
+
+    // Input card to $match is kCollCard. However, our estimate is larger than matchCard because we
+    // have a FilterNode that does not get converted to a SargableNode in this case. The $match
+    // selectivity here is sqrt(1000)/1000.
+    ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 1}]", 30.6228);
+    ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 20}]", 11.6228);
+    ASSERT_CE(t, "[{$limit: 1000}, {$match: {a: 1}}, {$skip: 1000}]", 0.0);
+
+    // Input card to $match: 999. $match selectivity here is sqrt(999)/999.
+    ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 1}]", 1.0);
+    ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 20}]", 20.0);
+    ASSERT_CE(t, "[{$skip: 1}, {$match: {a: 1}}, {$limit: 1000}]", 31.607);
+
+    // Input card to $match: 950. $match selectivity here is sqrt(950)/950.
+    ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 1}]", 1.0);
+    ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 20}]", 20.0);
+    ASSERT_CE(t, "[{$skip: 50}, {$match: {a: 1}}, {$limit: 1000}]", 30.8221);
+
+    // Input card to $match is 0.0.
+    ASSERT_CE(t, "[{$skip: 1000}, {$match: {a: 1}}, {$limit: 50}]", 0.0);
+    ASSERT_CE(t, "[{$skip: 1000}, {$match: {a: 1}}, {$limit: 1000}]", 0.0);
+}
+
+TEST(CEDataflowTest, EstimateUnwindNode) {
+    DataflowCETester t;
+    const CEType matchCard = t.getMatchCE("{a: 1}", isRootNodeFn);
+
+    // We assume that arrays on average have ~10 elements, so we estimate this as inputCard*10.
+    ASSERT_CE(t, "[{$unwind: '$a'}]", 10 * kCollCard);
+    ASSERT_CE(t, "[{$match: {a: 1}}, {$unwind: '$a'}]", 10 * matchCard);
+    ASSERT_CE(t, "[{$unwind: {path: '$a', preserveNullAndEmptyArrays: true}}]", 10 * kCollCard);
+    ASSERT_CE(t,
+              "[{$match: {a: 1}}, {$unwind: {path: '$a', preserveNullAndEmptyArrays: true}}]",
+              10 * matchCard);
+
+    // TODO SERVER-70035: implement histogram estimation of $unwind.
+}
+
+}  // namespace
+}  // namespace mongo::optimizer::ce