diff options
author | Charlie Swanson <charlie.swanson@mongodb.com> | 2022-08-16 19:53:04 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-08-16 21:53:42 +0000 |
commit | 39e4b0c6ac51e8127c2afdaf7cc39742df6df5e1 (patch) | |
tree | b2c16235af0a337eef16841ab4f8a1dd72743567 /src/mongo | |
parent | 5e77e5fbd3871c65ee20ade0a0cf1410c6b47a9c (diff) | |
download | mongo-39e4b0c6ac51e8127c2afdaf7cc39742df6df5e1.tar.gz |
SERVER-67264 Avoid COLUMN_SCAN if query needs overlapping fields
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/matcher/expression_algo.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/matcher/expression_algo.h | 12 | ||||
-rw-r--r-- | src/mongo/db/matcher/expression_algo_test.cpp | 19 | ||||
-rw-r--r-- | src/mongo/db/query/query_planner.cpp | 41 | ||||
-rw-r--r-- | src/mongo/db/query/query_planner_columnar_test.cpp | 104 | ||||
-rw-r--r-- | src/mongo/db/query/query_solution.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/query/query_solution.h | 2 | ||||
-rw-r--r-- | src/mongo/db/query/util/set_util.h | 17 |
8 files changed, 199 insertions, 20 deletions
diff --git a/src/mongo/db/matcher/expression_algo.cpp b/src/mongo/db/matcher/expression_algo.cpp index 90453c66db0..ba286cd4323 100644 --- a/src/mongo/db/matcher/expression_algo.cpp +++ b/src/mongo/db/matcher/expression_algo.cpp @@ -796,6 +796,22 @@ bool containsDependency(const OrderedPathSet& testSet, const OrderedPathSet& pre return false; } +bool containsOverlappingPaths(const OrderedPathSet& testSet) { + // We will take advantage of the fact that paths with common ancestors are ordered together in + // our ordering. Thus if there are any paths that contain a common ancestor, they will be right + // next to each other - unless there are multiple pairs, in which case at least one pair will be + // right next to each other. + if (testSet.empty()) { + return false; + } + for (auto it = std::next(testSet.begin()); it != testSet.end(); ++it) { + if (isPathPrefixOf(*std::prev(it), *it)) { + return true; + } + } + return false; +} + bool areIndependent(const OrderedPathSet& pathSet1, const OrderedPathSet& pathSet2) { return !containsDependency(pathSet1, pathSet2) && !containsDependency(pathSet2, pathSet1); } diff --git a/src/mongo/db/matcher/expression_algo.h b/src/mongo/db/matcher/expression_algo.h index 814974ba881..2e310a52d95 100644 --- a/src/mongo/db/matcher/expression_algo.h +++ b/src/mongo/db/matcher/expression_algo.h @@ -98,12 +98,20 @@ bool isSplittableBy(const MatchExpression& expr, const OrderedPathSet& pathSet); bool areIndependent(const OrderedPathSet& pathSet1, const OrderedPathSet& pathSet2); /** - * Return true if any of the fieldPaths in prefixCandidates are identical to or an ancestor of any - * of the fieldpaths in testSet. The order of the parameters matters -- it's not commutative. + * Return true if any of the paths in 'prefixCandidates' are identical to or an ancestor of any + * of the paths in 'testSet'. The order of the parameters matters -- it's not commutative. */ bool containsDependency(const OrderedPathSet& testSet, const OrderedPathSet& prefixCandidates); /** + * Returns true if any of the paths in 'testSet' are an ancestor of any of the other paths in + * 'testSet'. Examples: + * containsOverlappingPaths([a.b, a]) --> true + * containsOverlappingPaths([ab, a, a-b]) --> false + */ +bool containsOverlappingPaths(const OrderedPathSet& testSet); + +/** * Determine if 'expr' is reliant upon any path from 'pathSet'. */ bool isIndependentOf(const MatchExpression& expr, const OrderedPathSet& pathSet); diff --git a/src/mongo/db/matcher/expression_algo_test.cpp b/src/mongo/db/matcher/expression_algo_test.cpp index 73aa479bd2a..96117ff162a 100644 --- a/src/mongo/db/matcher/expression_algo_test.cpp +++ b/src/mongo/db/matcher/expression_algo_test.cpp @@ -950,6 +950,25 @@ TEST(IsIndependent, EmptyDependencySetsPassIsOnlyDependentOn) { ASSERT_TRUE(expression::isOnlyDependentOn(*matchExpression.get(), {})); } +TEST(ContainsOverlappingPaths, Basics) { + // No overlap cases. + ASSERT_FALSE(expression::containsOverlappingPaths({})); + ASSERT_FALSE(expression::containsOverlappingPaths({"a"})); + ASSERT_FALSE(expression::containsOverlappingPaths({"a.b"})); + ASSERT_FALSE(expression::containsOverlappingPaths({"a", "b"})); + ASSERT_FALSE(expression::containsOverlappingPaths({"a", "ab", "a-b"})); + ASSERT_FALSE(expression::containsOverlappingPaths({"a.b", "ab", "a-b"})); + ASSERT_FALSE(expression::containsOverlappingPaths({"a.b", "a.c", "a.d"})); + ASSERT_FALSE( + expression::containsOverlappingPaths({"users.address.zip", "users.address.state"})); + + // Overlap cases + ASSERT_TRUE(expression::containsOverlappingPaths({"a.b", "ab", "a-b", "a"})); + ASSERT_TRUE(expression::containsOverlappingPaths({"a.b", "ab", "a-b", "a.b.c"})); + ASSERT_TRUE(expression::containsOverlappingPaths({"a", "ab", "a-b", "a.b.c"})); + ASSERT_TRUE(expression::containsOverlappingPaths({"users.address", "users.address.state"})); +} + TEST(SplitMatchExpression, AndWithSplittableChildrenIsSplittable) { BSONObj matchPredicate = fromjson("{$and: [{a: 1}, {b: 1}]}"); boost::intrusive_ptr<ExpressionContextForTest> expCtx(new ExpressionContextForTest()); diff --git a/src/mongo/db/query/query_planner.cpp b/src/mongo/db/query/query_planner.cpp index 059ba11244f..9c05a0b3cf9 100644 --- a/src/mongo/db/query/query_planner.cpp +++ b/src/mongo/db/query/query_planner.cpp @@ -28,6 +28,7 @@ */ +#include "mongo/db/pipeline/dependencies.h" #include "mongo/platform/basic.h" #include <boost/optional.hpp> @@ -223,7 +224,8 @@ std::pair<DepsTracker, DepsTracker> computeDeps(const QueryPlannerParams& params return {std::move(filterDeps), std::move(outputDeps)}; } -Status columnScanIsPossibleStatus(const CanonicalQuery& query, const QueryPlannerParams& params) { +Status computeColumnScanIsPossibleStatus(const CanonicalQuery& query, + const QueryPlannerParams& params) { if (params.columnStoreIndexes.empty()) { return {ErrorCodes::InvalidOptions, "No columnstore indexes available"}; } @@ -241,7 +243,7 @@ Status columnScanIsPossibleStatus(const CanonicalQuery& query, const QueryPlanne } bool columnScanIsPossible(const CanonicalQuery& query, const QueryPlannerParams& params) { - return columnScanIsPossibleStatus(query, params).isOK(); + return computeColumnScanIsPossibleStatus(query, params).isOK(); } std::unique_ptr<QuerySolution> makeColumnScanPlan( @@ -250,6 +252,7 @@ std::unique_ptr<QuerySolution> makeColumnScanPlan( const ColumnIndexEntry& columnStoreIndex, DepsTracker filterDeps, DepsTracker outputDeps, + OrderedPathSet allFieldsReferenced, StringMap<std::unique_ptr<MatchExpression>> filterSplitByColumn, std::unique_ptr<MatchExpression> residualPredicate) { dassert(columnScanIsPossible(query, params)); @@ -261,6 +264,7 @@ std::unique_ptr<QuerySolution> makeColumnScanPlan( std::make_unique<ColumnIndexScanNode>(columnStoreIndex, std::move(outputDeps.fields), std::move(filterDeps.fields), + std::move(allFieldsReferenced), std::move(filterSplitByColumn), std::move(residualPredicate))); } @@ -269,16 +273,14 @@ std::unique_ptr<QuerySolution> makeColumnScanPlan( * A helper function which applies a heuristic to determine if a COLUMN_SCAN plan would examine few * enough fields to be considered faster than a COLLSCAN. */ -Status checkFieldLimits(const OrderedPathSet& filterDeps, - const OrderedPathSet& outputDeps, - const StringMap<std::unique_ptr<MatchExpression>>& filterSplitByColumn) { +Status checkColumnScanFieldLimits( + size_t nReferencedFields, + const StringMap<std::unique_ptr<MatchExpression>>& filterSplitByColumn) { - const int nReferencedFields = - static_cast<int>(set_util::setUnion(filterDeps, outputDeps).size()); const int maxNumFields = filterSplitByColumn.size() > 0 ? internalQueryMaxNumberOfFieldsToChooseFilteredColumnScan.load() : internalQueryMaxNumberOfFieldsToChooseUnfilteredColumnScan.load(); - if (nReferencedFields > maxNumFields) { + if (static_cast<int>(nReferencedFields) > maxNumFields) { return Status{ErrorCodes::Error{6430508}, str::stream() << "referenced too many fields. nReferenced=" << nReferencedFields << ", limit=" << maxNumFields}; @@ -294,7 +296,7 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan( const QueryPlannerParams& params, const CanonicalQuery& query, const boost::optional<ColumnIndexEntry>& hintedIndex = boost::none) { - if (auto status = columnScanIsPossibleStatus(query, params); !status.isOK()) { + if (auto status = computeColumnScanIsPossibleStatus(query, params); !status.isOK()) { return status; } @@ -311,13 +313,23 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan( } auto [filterDeps, outputDeps] = computeDeps(params, query); + auto allFieldsReferenced = set_util::setUnion(filterDeps.fields, outputDeps.fields); if (filterDeps.needWholeDocument || outputDeps.needWholeDocument) { - // We only want to use the columnar index if we can avoid fetching the whole document. // TODO SERVER-66284 Would like to enable a plan when hinted, even if we need the whole // document. Something like COLUMN_SCAN -> FETCH. return {ErrorCodes::Error{6298501}, "cannot use columnstore index because the query requires seeing the entire " "document"}; + } else if (!hintedIndex && expression::containsOverlappingPaths(allFieldsReferenced)) { + // The query needs a path and a parent or ancestor path. For example, the query needs to + // access both "a" and "a.b". This is a heuristic, but generally we would not expect this to + // benefit from the column store index. This kind of dependency pattern is probably an + // indication that the parent/ancestor path will be an object or array of objects, which + // will require us to fall back to the rowstore and remove any benefit of using the index. + return {ErrorCodes::Error{6726400}, + str::stream() << "cannot use columnstore index because the query requires paths " + "which are a prefix of each other: " + << set_util::setToString(allFieldsReferenced)}; } // TODO SERVER-67140: Check if the columnar index actually provides the fields we need. @@ -326,7 +338,7 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan( std::tie(filterSplitByColumn, residualPredicate) = expression::splitMatchExpressionForColumns(query.root()); auto fieldLimitStatus = - checkFieldLimits(filterDeps.fields, outputDeps.fields, filterSplitByColumn); + checkColumnScanFieldLimits(allFieldsReferenced.size(), filterSplitByColumn); if (fieldLimitStatus.isOK() || hintedIndex) { // We have a hint, or few enough dependencies that we suspect a column scan is still @@ -336,6 +348,7 @@ StatusWith<std::unique_ptr<QuerySolution>> tryToBuildColumnScan( columnStoreIndex, std::move(filterDeps), std::move(outputDeps), + std::move(allFieldsReferenced), std::move(filterSplitByColumn), std::move(residualPredicate)); } @@ -1444,8 +1457,12 @@ StatusWith<std::vector<std::unique_ptr<QuerySolution>>> QueryPlanner::plan( // Check whether we're eligible to use the columnar index, assuming no other indexes can be // used. if (out.empty()) { - if (auto statusWithSoln = tryToBuildColumnScan(params, query); statusWithSoln.isOK()) { + auto statusWithSoln = tryToBuildColumnScan(params, query); + if (statusWithSoln.isOK()) { out.emplace_back(std::move(statusWithSoln.getValue())); + } else { + LOGV2_DEBUG( + 6726401, 4, "Not using a column scan", "reason"_attr = statusWithSoln.getStatus()); } } diff --git a/src/mongo/db/query/query_planner_columnar_test.cpp b/src/mongo/db/query/query_planner_columnar_test.cpp index 914b9ae76f4..490164ee87b 100644 --- a/src/mongo/db/query/query_planner_columnar_test.cpp +++ b/src/mongo/db/query/query_planner_columnar_test.cpp @@ -280,6 +280,110 @@ TEST_F(QueryPlannerColumnarTest, assertSolutionExists(R"({proj: {spec: {a: 1, b: 1, c: 1}, node: {cscan: {dir: 1}}}})"); } +// Tests that a query which depends on overlapping parent/child fields like 'a.b' and 'a' will not +// use the column store index. +TEST_F(QueryPlannerColumnarTest, QueryWithOverlappingDependenciesDoesNotUseColumnarIndex) { + addColumnStoreIndexAndEnableFilterSplitting(); + + runQuerySortProj(BSONObj(), BSON("a.b" << 1 << "a.c" << 1), BSON("a" << 1)); + assertNumSolutions(1U); + assertSolutionExists(R"({ + sort: { + pattern: {"a.b": 1, "a.c": 1}, + limit: 0, + node: { + proj: { + spec: {a: 1}, + node: { + cscan: {dir: 1} + } + } + } + } + })"); +} + +TEST_F(QueryPlannerColumnarTest, QueryWithConflictingAncestralDependenciesDoesNotUseColumnarIndex) { + addColumnStoreIndexAndEnableFilterSplitting(); + + runQuerySortProj(BSONObj(), BSON("a.b.c" << 1), BSON("a" << 1)); + assertNumSolutions(1U); + assertSolutionExists(R"({ + sort: { + pattern: {"a.b.c": 1}, + limit: 0, + node: { + proj: { + spec: {a: 1}, + node: { + cscan: {dir: 1} + } + } + } + } + })"); +} + +// Test like those above, but proving that we do the prefix detection correctly and don't mistake +// regular (non-path) prefixes. +TEST_F(QueryPlannerColumnarTest, QueryWithSimilarDependenciesDoesUseColumnarIndex) { + addColumnStoreIndexAndEnableFilterSplitting(); + + runQuerySortProj(BSONObj(), BSON("abc" << 1), BSON("a" << 1)); + assertNumSolutions(1U); + assertSolutionExists(R"({ + proj: { + spec: {a: 1, _id: 1}, + node: { + sort: { + pattern: {"abc": 1}, + limit: 0, + node: { + column_scan: { + filtersByPath: {}, + outputFields: ['_id', 'a', 'abc'], + matchFields: [] + } + } + } + } + } + })"); +} + +// Test that adding a hint will allow you to use the column store index for a query with overlapping +// parent/child dependencies. +TEST_F(QueryPlannerColumnarTest, HintOverridesOverlappingFieldsCheck) { + addColumnStoreIndexAndEnableFilterSplitting(); + + runQuerySortProjSkipLimitHint(BSONObj(), + BSON("a.b.c" << 1), + BSON("a" << 1), + 0, + 0, + BSON("$**" + << "columnstore")); + assertNumSolutions(1U); + assertSolutionExists(R"({ + sort: { + pattern: {"a.b.c": 1}, + limit: 0, + node: { + proj: { + spec: {a: 1, _id: 1}, + node: { + column_scan: { + filtersByPath: {}, + outputFields: ['_id', 'a', 'a.b.c'], + matchFields: [] + } + } + } + } + } + })"); +} + TEST_F(QueryPlannerColumnarTest, HintOverridesFieldLimitUnfiltered) { addColumnStoreIndexAndEnableFilterSplitting(); internalQueryMaxNumberOfFieldsToChooseUnfilteredColumnScan.store(2); diff --git a/src/mongo/db/query/query_solution.cpp b/src/mongo/db/query/query_solution.cpp index f1f2295a727..32653070dea 100644 --- a/src/mongo/db/query/query_solution.cpp +++ b/src/mongo/db/query/query_solution.cpp @@ -1100,17 +1100,15 @@ bool IndexScanNode::operator==(const IndexScanNode& other) const { ColumnIndexScanNode::ColumnIndexScanNode(ColumnIndexEntry indexEntry, OrderedPathSet outputFieldsIn, OrderedPathSet matchFieldsIn, + OrderedPathSet allFieldsIn, StringMap<std::unique_ptr<MatchExpression>> filtersByPath, std::unique_ptr<MatchExpression> postAssemblyFilter) : indexEntry(std::move(indexEntry)), outputFields(std::move(outputFieldsIn)), matchFields(std::move(matchFieldsIn)), - allFields(set_util::setUnion(outputFields, matchFields)), + allFields(std::move(allFieldsIn)), filtersByPath(std::move(filtersByPath)), - postAssemblyFilter(std::move(postAssemblyFilter)) { - allFields = outputFields; - allFields.insert(matchFields.begin(), matchFields.end()); -} + postAssemblyFilter(std::move(postAssemblyFilter)) {} void ColumnIndexScanNode::appendToString(str::stream* ss, int indent) const { addIndent(ss, indent); diff --git a/src/mongo/db/query/query_solution.h b/src/mongo/db/query/query_solution.h index dc84b041fb1..56a6eb92bf4 100644 --- a/src/mongo/db/query/query_solution.h +++ b/src/mongo/db/query/query_solution.h @@ -513,6 +513,7 @@ struct ColumnIndexScanNode : public QuerySolutionNode { ColumnIndexScanNode(ColumnIndexEntry, OrderedPathSet outputFields, OrderedPathSet matchFields, + OrderedPathSet allFields, StringMap<std::unique_ptr<MatchExpression>> filtersByPath, std::unique_ptr<MatchExpression> postAssemblyFilter); @@ -545,6 +546,7 @@ struct ColumnIndexScanNode : public QuerySolutionNode { return std::make_unique<ColumnIndexScanNode>(indexEntry, outputFields, matchFields, + allFields, std::move(clonedFiltersByPath), postAssemblyFilter->shallowClone()); } diff --git a/src/mongo/db/query/util/set_util.h b/src/mongo/db/query/util/set_util.h index ab513eead71..a68f9a581f1 100644 --- a/src/mongo/db/query/util/set_util.h +++ b/src/mongo/db/query/util/set_util.h @@ -27,10 +27,12 @@ * it in the license file. */ +#pragma once + #include <algorithm> #include <set> -#pragma once +#include "mongo/bson/util/builder_fwd.h" namespace mongo::set_util { @@ -43,4 +45,17 @@ std::set<T, Comparator> setUnion(const std::set<T, Comparator>& set1, return out; } +template <typename T, typename Comparator> +std::string setToString(const std::set<T, Comparator>& set) { + StackStringBuilder sb; + sb << "{"; + for (auto it = set.begin(); it != set.end(); ++it) { + sb << *it; + if (std::next(it) != set.end()) + sb << ","; + } + sb << "}"; + return sb.str(); +} + } // namespace mongo::set_util |