SERVER-67264 Avoid COLUMN_SCAN if query needs overlapping fields

author: Charlie Swanson <charlie.swanson@mongodb.com> 2022-08-16 19:53:04 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-16 21:53:42 +0000
commit: 39e4b0c6ac51e8127c2afdaf7cc39742df6df5e1 (patch)
tree: b2c16235af0a337eef16841ab4f8a1dd72743567 /src/mongo
parent: 5e77e5fbd3871c65ee20ade0a0cf1410c6b47a9c (diff)
download: mongo-39e4b0c6ac51e8127c2afdaf7cc39742df6df5e1.tar.gz
8 files changed, 199 insertions, 20 deletions
diff --git a/src/mongo/db/matcher/expression_algo.cpp b/src/mongo/db/matcher/expression_algo.cpp
index 90453c66db0..ba286cd4323 100644
--- a/src/mongo/db/matcher/expression_algo.cpp
+++ b/src/mongo/db/matcher/expression_algo.cpp
@@ -796,6 +796,22 @@ bool containsDependency(const OrderedPathSet& testSet, const OrderedPathSet& pre
     return false;
 }
 
+bool containsOverlappingPaths(const OrderedPathSet& testSet) {
+    // We will take advantage of the fact that paths with common ancestors are ordered together in
+    // our ordering. Thus if there are any paths that contain a common ancestor, they will be right
+    // next to each other - unless there are multiple pairs, in which case at least one pair will be
+    // right next to each other.
+    if (testSet.empty()) {
+        return false;
+    }
+    for (auto it = std::next(testSet.begin()); it != testSet.end(); ++it) {
+        if (isPathPrefixOf(*std::prev(it), *it)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool areIndependent(const OrderedPathSet& pathSet1, const OrderedPathSet& pathSet2) {
     return !containsDependency(pathSet1, pathSet2) && !containsDependency(pathSet2, pathSet1);
 }
diff --git a/src/mongo/db/matcher/expression_algo.h b/src/mongo/db/matcher/expression_algo.h
index 814974ba881..2e310a52d95 100644
--- a/src/mongo/db/matcher/expression_algo.h
+++ b/src/mongo/db/matcher/expression_algo.h
@@ -98,12 +98,20 @@ bool isSplittableBy(const MatchExpression& expr, const OrderedPathSet& pathSet);
 bool areIndependent(const OrderedPathSet& pathSet1, const OrderedPathSet& pathSet2);
 
 /**
- * Return true if any of the fieldPaths in prefixCandidates are identical to or an ancestor of any
- * of the fieldpaths in testSet.  The order of the parameters matters -- it's not commutative.
+ * Return true if any of the paths in 'prefixCandidates' are identical to or an ancestor of any
+ * of the paths in 'testSet'.  The order of the parameters matters -- it's not commutative.
  */
 bool containsDependency(const OrderedPathSet& testSet, const OrderedPathSet& prefixCandidates);
 
 /**
+ * Returns true if any of the paths in 'testSet' are an ancestor of any of the other paths in
+ * 'testSet'. Examples:
+ * containsOverlappingPaths([a.b, a]) --> true
+ * containsOverlappingPaths([ab, a, a-b]) --> false
+ */
+bool containsOverlappingPaths(const OrderedPathSet& testSet);
+
+/**
  * Determine if 'expr' is reliant upon any path from 'pathSet'.
  */
 bool isIndependentOf(const MatchExpression& expr, const OrderedPathSet& pathSet);
diff --git a/src/mongo/db/matcher/expression_algo_test.cpp b/src/mongo/db/matcher/expression_algo_test.cpp
index 73aa479bd2a..96117ff162a 100644
--- a/src/mongo/db/matcher/expression_algo_test.cpp
+++ b/src/mongo/db/matcher/expression_algo_test.cpp
@@ -950,6 +950,25 @@ TEST(IsIndependent, EmptyDependencySetsPassIsOnlyDependentOn) {
     ASSERT_TRUE(expression::isOnlyDependentOn(*matchExpression.get(), {}));
 }
 
+TEST(ContainsOverlappingPaths, Basics) {
+    // No overlap cases.
+    ASSERT_FALSE(expression::containsOverlappingPaths({}));
+    ASSERT_FALSE(expression::containsOverlappingPaths({"a"}));
+    ASSERT_FALSE(expression::containsOverlappingPaths({"a.b"}));
+    ASSERT_FALSE(expression::containsOverlappingPaths({"a", "b"}));
+    ASSERT_FALSE(expression::containsOverlappingPaths({"a", "ab", "a-b"}));
+    ASSERT_FALSE(expression::containsOverlappingPaths({"a.b", "ab", "a-b"}));
+    ASSERT_FALSE(expression::containsOverlappingPaths({"a.b", "a.c", "a.d"}));
+    ASSERT_FALSE(
+        expression::containsOverlappingPaths({"users.address.zip", "users.address.state"}));
+
+    // Overlap cases
+    ASSERT_TRUE(expression::containsOverlappingPaths({"a.b", "ab", "a-b", "a"}));
+    ASSERT_TRUE(expression::containsOverlappingPaths({"a.b", "ab", "a-b", "a.b.c"}));
+    ASSERT_TRUE(expression::containsOverlappingPaths({"a", "ab", "a-b", "a.b.c"}));
+    ASSERT_TRUE(expression::containsOverlappingPaths({"users.address", "users.address.state"}));
+}
+
 TEST(SplitMatchExpression, AndWithSplittableChildrenIsSplittable) {
     BSONObj matchPredicate = fromjson("{$and: [{a: 1}, {b: 1}]}");
     boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest());
diff --git a/src/mongo/db/query/query_planner.cpp b/src/mongo/db/query/query_planner.cpp
index 059ba11244f..9c05a0b3cf9 100644
--- a/src/mongo/db/query/query_planner.cpp
+++ b/src/mongo/db/query/query_planner.cpp
@@ -28,6 +28,7 @@
  */
 
 
+#include "mongo/db/pipeline/dependencies.h"
 #include "mongo/platform/basic.h"
 
 #include <boost/optional.hpp>
@@ -223,7 +224,8 @@ std::pair<DepsTracker, DepsTracker> computeDeps(const QueryPlannerParams& params
     return {std::move(filterDeps), std::move(outputDeps)};
 }
 
-Status columnScanIsPossibleStatus(const CanonicalQuery& query, const QueryPlannerParams& params) {
+Status computeColumnScanIsPossibleStatus(const CanonicalQuery& query,
+                                         const QueryPlannerParams& params) {
     if (params.columnStoreIndexes.empty()) {
         return {ErrorCodes::InvalidOptions, "No columnstore indexes available"};
     }
@@ -241,7 +243,7 @@ Status columnScanIsPossibleStatus(const CanonicalQuery& query, const QueryPlanne
 }
 
 bool columnScanIsPossible(const CanonicalQuery& query, const QueryPlannerParams& params) {
-    return columnScanIsPossibleStatus(query, params).isOK();
+    return computeColumnScanIsPossibleStatus(query, params).isOK();
 }
 
 std::unique_ptr<QuerySolution> makeColumnScanPlan(
@@ -250,6 +252,7 @@ std::unique_ptr<QuerySolution> makeColumnScanPlan(
     const ColumnIndexEntry& columnStoreIndex,
     DepsTracker filterDeps,
     DepsTracker outputDeps,
+    OrderedPathSet allFieldsReferenced,
     StringMap<std::unique_ptr<MatchExpression>> filterSplitByColumn,
     std::unique_ptr<MatchExpression> residualPredicate) {
     dassert(columnScanIsPossible(query, params));
@@ -261,6 +264,7 @@ std::unique_ptr<QuerySolution> makeColumnScanPlan(
         std::make_unique<ColumnIndexScanNode>(columnStoreIndex,
                                               std::move(outputDeps.fields),
                                               std::move(filterDeps.fields),
+                                              std::move(allFieldsReferenced),
                                               std::move(filterSplitByColumn),
                                               std::move(residualPredicate)));
 }
@@ -269,16 +273,14 @@ std::unique_ptr<QuerySolution> makeColumnScanPlan(
  * A helper function which applies a heuristic to determine if a COLUMN_SCAN plan would examine few
  * enough fields to be considered faster than a COLLSCAN.
  */
-Status checkFieldLimits(const OrderedPathSet& filterDeps,
-                        const OrderedPathSet& outputDeps,
-                        const StringMap<std::unique_ptr<MatchExpression>>& filterSplitByColumn) {
+Status checkColumnScanFieldLimits(
+    size_t nReferencedFields,
+    const StringMap<std::unique_ptr<MatchExpression>>& filterSplitByColumn) {
 
-    const int nReferencedFields =
-        static_cast<int>(set_util::setUnion(filterDeps, outputDeps).size());
     const int maxNumFields = filterSplitByColumn.size() > 0
         ? internalQueryMaxNumberOfFieldsToChooseFilteredColumnScan.load()
         : internalQueryMaxNumberOfFieldsToChooseUnfilteredColumnScan.load();
-    if (nReferencedFields > maxNumFields) {
+    if (static_cast<int>(nReferencedFields) > maxNumFields) {
         return Status{ErrorCodes::Error{6430508},
                       str::stream() << "referenced too many fields. nReferenced="
                                     << nReferencedFields << ", limit=" << maxNumFields};
@@ -294,7 +296,7 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan(
     const QueryPlannerParams& params,
     const CanonicalQuery& query,
     const boost::optional<ColumnIndexEntry>& hintedIndex = boost::none) {
-    if (auto status = columnScanIsPossibleStatus(query, params); !status.isOK()) {
+    if (auto status = computeColumnScanIsPossibleStatus(query, params); !status.isOK()) {
         return status;
     }
 
@@ -311,13 +313,23 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan(
     }
 
     auto [filterDeps, outputDeps] = computeDeps(params, query);
+    auto allFieldsReferenced = set_util::setUnion(filterDeps.fields, outputDeps.fields);
     if (filterDeps.needWholeDocument || outputDeps.needWholeDocument) {
-        // We only want to use the columnar index if we can avoid fetching the whole document.
         // TODO SERVER-66284 Would like to enable a plan when hinted, even if we need the whole
         // document. Something like COLUMN_SCAN -> FETCH.
         return {ErrorCodes::Error{6298501},
                 "cannot use columnstore index because the query requires seeing the entire "
                 "document"};
+    } else if (!hintedIndex && expression::containsOverlappingPaths(allFieldsReferenced)) {
+        // The query needs a path and a parent or ancestor path. For example, the query needs to
+        // access both "a" and "a.b". This is a heuristic, but generally we would not expect this to
+        // benefit from the column store index. This kind of dependency pattern is probably an
+        // indication that the parent/ancestor path will be an object or array of objects, which
+        // will require us to fall back to the rowstore and remove any benefit of using the index.
+        return {ErrorCodes::Error{6726400},
+                str::stream() << "cannot use columnstore index because the query requires paths "
+                                 "which are a prefix of each other: "
+                              << set_util::setToString(allFieldsReferenced)};
     }
 
     // TODO SERVER-67140: Check if the columnar index actually provides the fields we need.
@@ -326,7 +338,7 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan(
     std::tie(filterSplitByColumn, residualPredicate) =
         expression::splitMatchExpressionForColumns(query.root());
     auto fieldLimitStatus =
-        checkFieldLimits(filterDeps.fields, outputDeps.fields, filterSplitByColumn);
+        checkColumnScanFieldLimits(allFieldsReferenced.size(), filterSplitByColumn);
 
     if (fieldLimitStatus.isOK() || hintedIndex) {
         // We have a hint, or few enough dependencies that we suspect a column scan is still
@@ -336,6 +348,7 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan(
                                   columnStoreIndex,
                                   std::move(filterDeps),
                                   std::move(outputDeps),
+                                  std::move(allFieldsReferenced),
                                   std::move(filterSplitByColumn),
                                   std::move(residualPredicate));
     }
@@ -1444,8 +1457,12 @@ StatusWith<std::vector<std::unique_ptr<QuerySolution>>> QueryPlanner::plan(
     // Check whether we're eligible to use the columnar index, assuming no other indexes can be
     // used.
     if (out.empty()) {
-        if (auto statusWithSoln = tryToBuildColumnScan(params, query); statusWithSoln.isOK()) {
+        auto statusWithSoln = tryToBuildColumnScan(params, query);
+        if (statusWithSoln.isOK()) {
             out.emplace_back(std::move(statusWithSoln.getValue()));
+        } else {
+            LOGV2_DEBUG(
+                6726401, 4, "Not using a column scan", "reason"_attr = statusWithSoln.getStatus());
         }
     }
 
diff --git a/src/mongo/db/query/query_planner_columnar_test.cpp b/src/mongo/db/query/query_planner_columnar_test.cpp
index 914b9ae76f4..490164ee87b 100644
--- a/src/mongo/db/query/query_planner_columnar_test.cpp
+++ b/src/mongo/db/query/query_planner_columnar_test.cpp
@@ -280,6 +280,110 @@ TEST_F(QueryPlannerColumnarTest,
     assertSolutionExists(R"({proj: {spec: {a: 1, b: 1, c: 1}, node: {cscan: {dir: 1}}}})");
 }
 
+// Tests that a query which depends on overlapping parent/child fields like 'a.b' and 'a' will not
+// use the column store index.
+TEST_F(QueryPlannerColumnarTest, QueryWithOverlappingDependenciesDoesNotUseColumnarIndex) {
+    addColumnStoreIndexAndEnableFilterSplitting();
+
+    runQuerySortProj(BSONObj(), BSON("a.b" << 1 << "a.c" << 1), BSON("a" << 1));
+    assertNumSolutions(1U);
+    assertSolutionExists(R"({
+        sort: {
+            pattern: {"a.b": 1, "a.c": 1},
+            limit: 0,
+            node: {
+                proj: {
+                    spec: {a: 1},
+                    node: {
+                        cscan: {dir: 1}
+                    }
+                }
+            }
+        }
+    })");
+}
+
+TEST_F(QueryPlannerColumnarTest, QueryWithConflictingAncestralDependenciesDoesNotUseColumnarIndex) {
+    addColumnStoreIndexAndEnableFilterSplitting();
+
+    runQuerySortProj(BSONObj(), BSON("a.b.c" << 1), BSON("a" << 1));
+    assertNumSolutions(1U);
+    assertSolutionExists(R"({
+        sort: {
+            pattern: {"a.b.c": 1},
+            limit: 0,
+            node: {
+                proj: {
+                    spec: {a: 1},
+                    node: {
+                        cscan: {dir: 1}
+                    }
+                }
+            }
+        }
+    })");
+}
+
+// Test like those above, but proving that we do the prefix detection correctly and don't mistake
+// regular (non-path) prefixes.
+TEST_F(QueryPlannerColumnarTest, QueryWithSimilarDependenciesDoesUseColumnarIndex) {
+    addColumnStoreIndexAndEnableFilterSplitting();
+
+    runQuerySortProj(BSONObj(), BSON("abc" << 1), BSON("a" << 1));
+    assertNumSolutions(1U);
+    assertSolutionExists(R"({
+        proj: {
+            spec: {a: 1, _id: 1},
+            node: {
+                sort: {
+                    pattern: {"abc": 1},
+                    limit: 0,
+                    node: {
+                        column_scan: {
+                            filtersByPath: {},
+                            outputFields: ['_id', 'a', 'abc'],
+                            matchFields: []
+                        }
+                    }
+                }
+            }
+        }
+    })");
+}
+
+// Test that adding a hint will allow you to use the column store index for a query with overlapping
+// parent/child dependencies.
+TEST_F(QueryPlannerColumnarTest, HintOverridesOverlappingFieldsCheck) {
+    addColumnStoreIndexAndEnableFilterSplitting();
+
+    runQuerySortProjSkipLimitHint(BSONObj(),
+                                  BSON("a.b.c" << 1),
+                                  BSON("a" << 1),
+                                  0,
+                                  0,
+                                  BSON("$**"
+                                       << "columnstore"));
+    assertNumSolutions(1U);
+    assertSolutionExists(R"({
+        sort: {
+            pattern: {"a.b.c": 1},
+            limit: 0,
+            node: {
+                proj: {
+                    spec: {a: 1, _id: 1},
+                    node: {
+                        column_scan: {
+                            filtersByPath: {},
+                            outputFields: ['_id', 'a', 'a.b.c'],
+                            matchFields: []
+                        }
+                    }
+                }
+            }
+        }
+    })");
+}
+
 TEST_F(QueryPlannerColumnarTest, HintOverridesFieldLimitUnfiltered) {
     addColumnStoreIndexAndEnableFilterSplitting();
     internalQueryMaxNumberOfFieldsToChooseUnfilteredColumnScan.store(2);
diff --git a/src/mongo/db/query/query_solution.cpp b/src/mongo/db/query/query_solution.cpp
index f1f2295a727..32653070dea 100644
--- a/src/mongo/db/query/query_solution.cpp
+++ b/src/mongo/db/query/query_solution.cpp
@@ -1100,17 +1100,15 @@ bool IndexScanNode::operator==(const IndexScanNode& other) const {
 ColumnIndexScanNode::ColumnIndexScanNode(ColumnIndexEntry indexEntry,
                                          OrderedPathSet outputFieldsIn,
                                          OrderedPathSet matchFieldsIn,
+                                         OrderedPathSet allFieldsIn,
                                          StringMap<std::unique_ptr<MatchExpression>> filtersByPath,
                                          std::unique_ptr<MatchExpression> postAssemblyFilter)
     : indexEntry(std::move(indexEntry)),
       outputFields(std::move(outputFieldsIn)),
       matchFields(std::move(matchFieldsIn)),
-      allFields(set_util::setUnion(outputFields, matchFields)),
+      allFields(std::move(allFieldsIn)),
       filtersByPath(std::move(filtersByPath)),
-      postAssemblyFilter(std::move(postAssemblyFilter)) {
-    allFields = outputFields;
-    allFields.insert(matchFields.begin(), matchFields.end());
-}
+      postAssemblyFilter(std::move(postAssemblyFilter)) {}
 
 void ColumnIndexScanNode::appendToString(str::stream* ss, int indent) const {
     addIndent(ss, indent);
diff --git a/src/mongo/db/query/query_solution.h b/src/mongo/db/query/query_solution.h
index dc84b041fb1..56a6eb92bf4 100644
--- a/src/mongo/db/query/query_solution.h
+++ b/src/mongo/db/query/query_solution.h
@@ -513,6 +513,7 @@ struct ColumnIndexScanNode : public QuerySolutionNode {
     ColumnIndexScanNode(ColumnIndexEntry,
                         OrderedPathSet outputFields,
                         OrderedPathSet matchFields,
+                        OrderedPathSet allFields,
                         StringMap<std::unique_ptr<MatchExpression>> filtersByPath,
                         std::unique_ptr<MatchExpression> postAssemblyFilter);
 
@@ -545,6 +546,7 @@ struct ColumnIndexScanNode : public QuerySolutionNode {
         return std::make_unique<ColumnIndexScanNode>(indexEntry,
                                                      outputFields,
                                                      matchFields,
+                                                     allFields,
                                                      std::move(clonedFiltersByPath),
                                                      postAssemblyFilter->shallowClone());
     }
diff --git a/src/mongo/db/query/util/set_util.h b/src/mongo/db/query/util/set_util.h
index ab513eead71..a68f9a581f1 100644
--- a/src/mongo/db/query/util/set_util.h
+++ b/src/mongo/db/query/util/set_util.h
@@ -27,10 +27,12 @@
  *    it in the license file.
  */
 
+#pragma once
+
 #include <algorithm>
 #include <set>
 
-#pragma once
+#include "mongo/bson/util/builder_fwd.h"
 
 namespace mongo::set_util {
 
@@ -43,4 +45,17 @@ std::set<T, Comparator> setUnion(const std::set<T, Comparator>& set1,
     return out;
 }
 
+template <typename T, typename Comparator>
+std::string setToString(const std::set<T, Comparator>& set) {
+    StackStringBuilder sb;
+    sb << "{";
+    for (auto it = set.begin(); it != set.end(); ++it) {
+        sb << *it;
+        if (std::next(it) != set.end())
+            sb << ",";
+    }
+    sb << "}";
+    return sb.str();
+}
+
 }  // namespace mongo::set_util
author	Charlie Swanson <charlie.swanson@mongodb.com>	2022-08-16 19:53:04 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-08-16 21:53:42 +0000
commit	39e4b0c6ac51e8127c2afdaf7cc39742df6df5e1 (patch)
tree	b2c16235af0a337eef16841ab4f8a1dd72743567 /src/mongo
parent	5e77e5fbd3871c65ee20ade0a0cf1410c6b47a9c (diff)
download	mongo-39e4b0c6ac51e8127c2afdaf7cc39742df6df5e1.tar.gz